This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch genai-text-extractor in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit 4734256c62fec54236e16490cbc613fb1e5de7b6 Author: Richard Zowalal <[email protected]> AuthorDate: Thu Jun 12 15:17:15 2025 +0200 Introduces TextExtractor as interface. Renames previous TextExtractor to JSoupTextExtractor. Allow configuration of text extractor used via config properties. --- .../apache/stormcrawler/bolt/JSoupParserBolt.java | 25 ++- ...{TextExtractor.java => JSoupTextExtractor.java} | 65 +++--- .../apache/stormcrawler/parse/TextExtractor.java | 219 +-------------------- core/src/main/resources/crawler-default.yaml | 1 + ...ractorTest.java => JSoupTextExtractorTest.java} | 22 +-- 5 files changed, 72 insertions(+), 260 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 01d9d797..b6751914 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -21,6 +21,7 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; @@ -44,6 +45,7 @@ import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.parse.DocumentFragmentBuilder; import org.apache.stormcrawler.parse.JSoupFilter; import org.apache.stormcrawler.parse.JSoupFilters; +import org.apache.stormcrawler.parse.JSoupTextExtractor; import org.apache.stormcrawler.parse.Outlink; import org.apache.stormcrawler.parse.ParseData; import org.apache.stormcrawler.parse.ParseFilter; @@ -161,7 +163,28 @@ public class JSoupParserBolt extends StatusEmitterBolt { ignoreMetaRedirections = ConfUtils.getBoolean(conf, "jsoup.ignore.meta.redirections", false); - textExtractor = new TextExtractor(conf); + final String clazz = + ConfUtils.getString( + 
conf, "textextractor.class", JSoupTextExtractor.class.getName()); + try { + textExtractor = + (TextExtractor) + Class.forName(clazz) + .getDeclaredConstructor(Map.class) + .newInstance(conf); + } catch (ClassNotFoundException e) { + LOG.warn("Could not load configured textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } catch (NoSuchMethodException e) { + LOG.warn( + "Configured textextractor.class '{}' does not provide a Map argument constructor.", + clazz, + e); + throw new RuntimeException(e); + } catch (InvocationTargetException | InstantiationException | IllegalAccessException e) { + LOG.warn("Cannot instantiate textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } } @Override diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java similarity index 83% copy from core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java copy to core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java index 99fdd16c..55b9279f 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java @@ -20,6 +20,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.Contract; import org.jsoup.helper.Validate; import org.jsoup.select.NodeVisitor; * * @since 1.13 */ -public class TextExtractor { +public class JSoupTextExtractor implements TextExtractor { public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; @@ -65,53 +66,57 @@ public class TextExtractor { public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; private final List<String> 
inclusionPatterns; - private final HashSet<String> excludedTags; + private final Set<String> excludedTags; private final boolean noText; private final int maxTextSize; - public TextExtractor(Map<String, Object> stormConf) { + public JSoupTextExtractor(Map<String, Object> stormConf) { maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); + excludedTags = new HashSet<>(); ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); } - public String text(Element element) { + public String text(Object o) { // not interested in getting any text? if (noText) return ""; - final StringBuilder accum = new StringBuilder(); + if (o instanceof Element element) { - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); + final StringBuilder accum = new StringBuilder(); - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; + // no patterns at all - return the text from the whole document + if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { + text(element, accum); + } else { + Elements matches = new Elements(); + + for (String pattern : inclusionPatterns) { + matches = element.select(pattern); + if (!matches.isEmpty()) { + break; + } } - } - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } + // if nothing matches or no patterns were defined use the whole doc + if (matches.isEmpty()) { + matches.add(element); + } - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); + for (Element node : matches) { 
+ text(node, accum); + accum.append("\n"); + } } - } - return accum.toString().trim(); + return accum.toString().trim(); + } + return ""; } - private void _text(Node node, final StringBuilder accum) { + private void text(Node node, final StringBuilder accum) { traverse( new NodeVisitor() { @@ -157,8 +162,7 @@ public class TextExtractor { * @param visitor Node visitor. * @param root the root node point to traverse. */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { + private void traverse(NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { Validate.notNull(visitor, "null visitor in traverse method"); Validate.notNull(root, "null root node in traverse method"); Node node = root; @@ -219,8 +223,7 @@ public class TextExtractor { if (node == null) return false; // looks only at this element and five levels up, to prevent recursion & // needless stack searches - if (node instanceof Element) { - Element el = (Element) node; + if (node instanceof Element el) { int i = 0; do { if (el.tag().preserveWhitespace()) return true; @@ -232,6 +235,6 @@ public class TextExtractor { } static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; + return !sb.isEmpty() && sb.charAt(sb.length() - 1) == ' '; } } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java index 99fdd16c..c9678bd2 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java @@ -16,222 +16,7 @@ */ package org.apache.stormcrawler.parse; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.stormcrawler.util.ConfUtils; -import org.jetbrains.annotations.Contract; -import org.jsoup.helper.Validate; -import org.jsoup.internal.StringUtil; 
-import org.jsoup.nodes.CDataNode; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.Elements; -import org.jsoup.select.NodeVisitor; +public interface TextExtractor { -/** - * Filters the text extracted from HTML documents, used by JSoupParserBolt. Configured with optional - * inclusion patterns based on <a href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> - * JSoup selectors</a>, as well as a list of tags to be excluded. - * - * <p>Replaces ContentFilter. - * - * <p>The first matching inclusion pattern is used or the whole document if no expressions are - * configured or no match has been found. - * - * <p>The TextExtraction can be configured as so: - * - * <pre>{@code - * textextractor.include.pattern: - * - DIV[id="maincontent"] - * - DIV[itemprop="articleBody"] - * - ARTICLE - * - * textextractor.exclude.tags: - * - STYLE - * - SCRIPT - * - * }</pre> - * - * @since 1.13 - */ -public class TextExtractor { - - public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; - public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; - public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; - public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; - - private final List<String> inclusionPatterns; - private final HashSet<String> excludedTags; - private final boolean noText; - private final int maxTextSize; - - public TextExtractor(Map<String, Object> stormConf) { - maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); - noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); - inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); - ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) - .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); - } - - public String text(Element element) { - // 
not interested in getting any text? - if (noText) return ""; - - final StringBuilder accum = new StringBuilder(); - - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); - - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; - } - } - - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } - - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); - } - } - - return accum.toString().trim(); - } - - private void _text(Node node, final StringBuilder accum) { - traverse( - new NodeVisitor() { - - private Node excluded = null; - - public void head(Node node, int depth) { - if (excluded == null && node instanceof TextNode) { - TextNode textNode = (TextNode) node; - appendNormalisedText(accum, textNode); - } else if (node instanceof Element) { - Element element = (Element) node; - if (excludedTags.contains(element.tagName())) { - excluded = element; - } - if (accum.length() > 0 - && (element.isBlock() || element.tag().getName().equals("br")) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - - public void tail(Node node, int depth) { - // make sure there is a space between block tags and immediately - // following text nodes <div>One</div>Two should be "One Two". - if (node instanceof Element) { - Element element = (Element) node; - if (element == excluded) { - excluded = null; - } - if (element.isBlock() - && (node.nextSibling() instanceof TextNode) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - }, - node, - maxTextSize, - accum); - } - - /** - * Start a depth-first traverse of the root and all of its descendants. - * - * @param visitor Node visitor. - * @param root the root node point to traverse. 
- */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { - Validate.notNull(visitor, "null visitor in traverse method"); - Validate.notNull(root, "null root node in traverse method"); - Node node = root; - int depth = 0; - - while (node != null) { - // interrupts if too much text has already been produced - if (maxSize > 0 && builder.length() >= maxSize) return; - - Node parent = - node.parentNode(); // remember parent to find nodes that get replaced in .head - int origSize = parent != null ? parent.childNodeSize() : 0; - Node next = node.nextSibling(); - - visitor.head(node, depth); // visit current node - if (parent != null && !node.hasParent()) { // removed or replaced - if (origSize == parent.childNodeSize()) { // replaced - node = - parent.childNode( - node.siblingIndex()); // replace ditches parent but keeps - // sibling index - } else { // removed - node = next; - if (node == null) { // last one, go up - node = parent; - depth--; - } - continue; // don't tail removed - } - } - - if (node.childNodeSize() > 0) { // descend - node = node.childNode(0); - depth++; - } else { - // when no more siblings, ascend - while (node.nextSibling() == null && depth > 0) { - visitor.tail(node, depth); - node = node.parentNode(); - depth--; - } - visitor.tail(node, depth); - if (node == root) break; - node = node.nextSibling(); - } - } - } - - private static void appendNormalisedText(final StringBuilder accum, final TextNode textNode) { - final String text = textNode.getWholeText(); - if (textNode instanceof CDataNode || preserveWhitespace(textNode.parent())) - accum.append(text); - else StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum)); - } - - @Contract("null -> false") - static boolean preserveWhitespace(Node node) { - if (node == null) return false; - // looks only at this element and five levels up, to prevent recursion & - // needless stack searches - if (node instanceof Element) { - Element 
el = (Element) node; - int i = 0; - do { - if (el.tag().preserveWhitespace()) return true; - el = el.parent(); - i++; - } while (i < 6 && el != null); - } - return false; - } - - static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; - } + String text(Object element); } diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index 80ca74ed..27021730 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -244,6 +244,7 @@ config: detect.mimetype: true detect.charset.maxlength: 10000 + #textextractor.class: "org.apache.stormcrawler.parse.JSoupTextExtractor" textextractor.skip.after: -1 # filters URLs in sitemaps based on their modified Date (if any) diff --git a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java similarity index 80% rename from core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java rename to core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java index a5b00662..288cdc8b 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java @@ -27,13 +27,13 @@ import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; -class TextExtractorTest { +class JSoupTextExtractorTest { @Test void testMainContent() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>"; 
Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); @@ -44,8 +44,8 @@ class TextExtractorTest { @Test void testExclusion() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<style>main</style>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -55,8 +55,8 @@ class TextExtractorTest { @Test void testExclusionCase() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "style"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<STYLE>main</STYLE>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -68,14 +68,14 @@ class TextExtractorTest { Config conf = new Config(); List<String> listinc = new LinkedList<>(); listinc.add("ARTICLE"); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, listinc); + conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, listinc); List<String> listex = new LinkedList<>(); listex.add("STYLE"); listex.add("SCRIPT"); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, listex); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, listex); // set a limit - conf.put(TextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String filename = "longtext.html"; Document jsoupDoc = Jsoup.parse(
