This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch genai-text-extractor in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit a844db1e66651ea145e7d30bb58a739404488316 Author: Richard Zowalla <[email protected]> AuthorDate: Thu Jun 12 15:17:15 2025 +0200 #1558 - Add a LLM-based TextExtractor (OpenAI API compatible) --- THIRD-PARTY.txt | 9 +- .../apache/stormcrawler/bolt/JSoupParserBolt.java | 25 ++- ...{TextExtractor.java => JSoupTextExtractor.java} | 70 ++++--- .../apache/stormcrawler/parse/TextExtractor.java | 222 +-------------------- core/src/main/resources/crawler-default.yaml | 1 + ...ractorTest.java => JSoupTextExtractorTest.java} | 22 +- external/ai/README.md | 85 ++++++++ external/ai/ai-conf.yaml | 34 ++++ external/ai/pom.xml | 64 ++++++ .../stormcrawler/ai/AbstractLLMTextExtractor.java | 162 +++++++++++++++ .../stormcrawler/ai/OpenAITextExtractor.java | 83 ++++++++ .../ai/listener/LlmResponseListener.java | 42 ++++ .../stormcrawler/ai/listener/NoOpListener.java | 47 +++++ .../ai/src/main/resources/llm-default-prompt.txt | 44 ++++ .../stormcrawler/ai/OpenAITextExtractorTest.java | 73 +++++++ external/ai/src/test/resources/stormcrawler.html | 118 +++++++++++ external/pom.xml | 2 +- pom.xml | 8 + 18 files changed, 843 insertions(+), 268 deletions(-) diff --git a/THIRD-PARTY.txt b/THIRD-PARTY.txt index 7ecc6bd6..3e4368fd 100644 --- a/THIRD-PARTY.txt +++ b/THIRD-PARTY.txt @@ -159,9 +159,6 @@ List of third-party dependencies grouped by their license type. 
* Jackcess Encrypt (com.healthmarketscience.jackcess:jackcess-encrypt:4.0.3 - http://jackcessencrypt.sf.net) * Jackson-annotations (com.fasterxml.jackson.core:jackson-annotations:2.18.1 - https://github.com/FasterXML/jackson) * Jackson-core (com.fasterxml.jackson.core:jackson-core:2.18.1 - https://github.com/FasterXML/jackson-core) - * Jackson-core (com.fasterxml.jackson.core:jackson-core:2.18.2 - https://github.com/FasterXML/jackson-core) - * Jackson-core (com.fasterxml.jackson.core:jackson-core:2.18.3 - https://github.com/FasterXML/jackson-core) - * Jackson-core (com.fasterxml.jackson.core:jackson-core:2.19.0 - https://github.com/FasterXML/jackson-core) * jackson-databind (com.fasterxml.jackson.core:jackson-databind:2.18.1 - https://github.com/FasterXML/jackson) * Jackson dataformat: CBOR (com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.17.2 - https://github.com/FasterXML/jackson-dataformats-binary) * Jackson dataformat: CBOR (com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.18.2 - https://github.com/FasterXML/jackson-dataformats-binary) @@ -181,6 +178,11 @@ List of third-party dependencies grouped by their license type. 
* Kotlin Stdlib Common (org.jetbrains.kotlin:kotlin-stdlib-common:1.9.10 - https://kotlinlang.org/) * Kotlin Stdlib Jdk7 (org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.8.21 - https://kotlinlang.org/) * Kotlin Stdlib Jdk8 (org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.8.21 - https://kotlinlang.org/) + * LangChain4j :: Core (dev.langchain4j:langchain4j-core:1.0.1 - https://github.com/langchain4j/langchain4j/tree/main/langchain4j-core) + * LangChain4j :: HTTP Client :: JDK HttpClient (dev.langchain4j:langchain4j-http-client-jdk:1.0.1 - https://github.com/langchain4j/langchain4j/tree/main/langchain4j-http-client-jdk) + * LangChain4j :: HTTP Client (dev.langchain4j:langchain4j-http-client:1.0.1 - https://github.com/langchain4j/langchain4j/tree/main/langchain4j-http-client) + * LangChain4j :: Integration :: OpenAI (dev.langchain4j:langchain4j-open-ai:1.0.1 - https://github.com/langchain4j/langchain4j/tree/main/langchain4j-open-ai) + * LangChain4j (dev.langchain4j:langchain4j:1.0.1 - https://github.com/langchain4j/langchain4j/tree/main/langchain4j) * lang-mustache (org.opensearch.plugin:lang-mustache-client:2.19.1 - https://github.com/opensearch-project/OpenSearch.git) * language-detector (com.optimaize.languagedetector:language-detector:0.6 - https://github.com/optimaize/language-detector) * Log4j Implemented Over SLF4J (org.slf4j:log4j-over-slf4j:2.0.17 - http://www.slf4j.org) @@ -360,6 +362,7 @@ List of third-party dependencies grouped by their license type. 
* Joni (org.jruby.joni:joni:2.2.1 - http://nexus.sonatype.org/oss-repository-hosting.html/joni) * JOpt Simple (net.sf.jopt-simple:jopt-simple:5.0.4 - http://jopt-simple.github.io/jopt-simple) * jsoup Java HTML Parser (org.jsoup:jsoup:1.20.1 - https://jsoup.org/) + * JTokkit (com.knuddels:jtokkit:1.1.0 - https://github.com/knuddelsgmbh/jtokkit) * org.brotli:dec (org.brotli:dec:0.1.2 - http://brotli.org/dec) * semver4j (org.semver4j:semver4j:5.3.0 - https://github.com/semver4j/semver4j) * SLF4J API Module (org.slf4j:slf4j-api:1.7.36 - http://www.slf4j.org) diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 01d9d797..b6751914 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -21,6 +21,7 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; @@ -44,6 +45,7 @@ import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.parse.DocumentFragmentBuilder; import org.apache.stormcrawler.parse.JSoupFilter; import org.apache.stormcrawler.parse.JSoupFilters; +import org.apache.stormcrawler.parse.JSoupTextExtractor; import org.apache.stormcrawler.parse.Outlink; import org.apache.stormcrawler.parse.ParseData; import org.apache.stormcrawler.parse.ParseFilter; @@ -161,7 +163,28 @@ public class JSoupParserBolt extends StatusEmitterBolt { ignoreMetaRedirections = ConfUtils.getBoolean(conf, "jsoup.ignore.meta.redirections", false); - textExtractor = new TextExtractor(conf); + final String clazz = + ConfUtils.getString( + conf, "textextractor.class", JSoupTextExtractor.class.getName()); + try { + textExtractor = + 
(TextExtractor) + Class.forName(clazz) + .getDeclaredConstructor(Map.class) + .newInstance(conf); + } catch (ClassNotFoundException e) { + LOG.warn("Could not load configured textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } catch (NoSuchMethodException e) { + LOG.warn( + "Configured textextractor.class '{}' does not provide a Map argument constructor.", + clazz, + e); + throw new RuntimeException(e); + } catch (InvocationTargetException | InstantiationException | IllegalAccessException e) { + LOG.warn("Cannot instantiazr textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } } @Override diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java similarity index 81% copy from core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java copy to core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java index 99fdd16c..99deee32 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java @@ -20,6 +20,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.Contract; import org.jsoup.helper.Validate; @@ -57,61 +58,60 @@ import org.jsoup.select.NodeVisitor; * * @since 1.13 */ -public class TextExtractor { - - public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; - public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; - public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; - public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; +public class JSoupTextExtractor implements TextExtractor { private final List<String> inclusionPatterns; - private final HashSet<String> 
excludedTags; + private final Set<String> excludedTags; private final boolean noText; private final int maxTextSize; - public TextExtractor(Map<String, Object> stormConf) { + public JSoupTextExtractor(Map<String, Object> stormConf) { maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); + excludedTags = new HashSet<>(); ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); } - public String text(Element element) { + public String text(Object o) { // not interested in getting any text? if (noText) return ""; - final StringBuilder accum = new StringBuilder(); + if (o instanceof Element element) { + + final StringBuilder accum = new StringBuilder(); - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); + // no patterns at all - return the text from the whole document + if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { + text(element, accum); + } else { + Elements matches = new Elements(); - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; + for (String pattern : inclusionPatterns) { + matches = element.select(pattern); + if (!matches.isEmpty()) { + break; + } } - } - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } + // if nothing matches or no patterns were defined use the whole doc + if (matches.isEmpty()) { + matches.add(element); + } - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); + for (Element node : matches) { + text(node, accum); + accum.append("\n"); + } } - 
} - return accum.toString().trim(); + return accum.toString().trim(); + } + return ""; } - private void _text(Node node, final StringBuilder accum) { + private void text(Node node, final StringBuilder accum) { traverse( new NodeVisitor() { @@ -157,8 +157,7 @@ public class TextExtractor { * @param visitor Node visitor. * @param root the root node point to traverse. */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { + private void traverse(NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { Validate.notNull(visitor, "null visitor in traverse method"); Validate.notNull(root, "null root node in traverse method"); Node node = root; @@ -219,8 +218,7 @@ public class TextExtractor { if (node == null) return false; // looks only at this element and five levels up, to prevent recursion & // needless stack searches - if (node instanceof Element) { - Element el = (Element) node; + if (node instanceof Element el) { int i = 0; do { if (el.tag().preserveWhitespace()) return true; @@ -232,6 +230,6 @@ public class TextExtractor { } static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; + return !sb.isEmpty() && sb.charAt(sb.length() - 1) == ' '; } } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java index 99fdd16c..1a29d58e 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java @@ -16,222 +16,12 @@ */ package org.apache.stormcrawler.parse; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.stormcrawler.util.ConfUtils; -import org.jetbrains.annotations.Contract; -import org.jsoup.helper.Validate; -import org.jsoup.internal.StringUtil; -import org.jsoup.nodes.CDataNode; -import 
org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.Elements; -import org.jsoup.select.NodeVisitor; +public interface TextExtractor { -/** - * Filters the text extracted from HTML documents, used by JSoupParserBolt. Configured with optional - * inclusion patterns based on <a href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> - * JSoup selectors</a>, as well as a list of tags to be excluded. - * - * <p>Replaces ContentFilter. - * - * <p>The first matching inclusion pattern is used or the whole document if no expressions are - * configured or no match has been found. - * - * <p>The TextExtraction can be configured as so: - * - * <pre>{@code - * textextractor.include.pattern: - * - DIV[id="maincontent"] - * - DIV[itemprop="articleBody"] - * - ARTICLE - * - * textextractor.exclude.tags: - * - STYLE - * - SCRIPT - * - * }</pre> - * - * @since 1.13 - */ -public class TextExtractor { - - public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; - public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; - public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; - public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; - - private final List<String> inclusionPatterns; - private final HashSet<String> excludedTags; - private final boolean noText; - private final int maxTextSize; - - public TextExtractor(Map<String, Object> stormConf) { - maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); - noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); - inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); - ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) - .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); - } - - public String text(Element element) { - // not interested in getting any text? 
- if (noText) return ""; - - final StringBuilder accum = new StringBuilder(); - - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); - - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; - } - } - - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } - - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); - } - } - - return accum.toString().trim(); - } - - private void _text(Node node, final StringBuilder accum) { - traverse( - new NodeVisitor() { - - private Node excluded = null; - - public void head(Node node, int depth) { - if (excluded == null && node instanceof TextNode) { - TextNode textNode = (TextNode) node; - appendNormalisedText(accum, textNode); - } else if (node instanceof Element) { - Element element = (Element) node; - if (excludedTags.contains(element.tagName())) { - excluded = element; - } - if (accum.length() > 0 - && (element.isBlock() || element.tag().getName().equals("br")) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - - public void tail(Node node, int depth) { - // make sure there is a space between block tags and immediately - // following text nodes <div>One</div>Two should be "One Two". - if (node instanceof Element) { - Element element = (Element) node; - if (element == excluded) { - excluded = null; - } - if (element.isBlock() - && (node.nextSibling() instanceof TextNode) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - }, - node, - maxTextSize, - accum); - } - - /** - * Start a depth-first traverse of the root and all of its descendants. - * - * @param visitor Node visitor. - * @param root the root node point to traverse. 
- */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { - Validate.notNull(visitor, "null visitor in traverse method"); - Validate.notNull(root, "null root node in traverse method"); - Node node = root; - int depth = 0; - - while (node != null) { - // interrupts if too much text has already been produced - if (maxSize > 0 && builder.length() >= maxSize) return; - - Node parent = - node.parentNode(); // remember parent to find nodes that get replaced in .head - int origSize = parent != null ? parent.childNodeSize() : 0; - Node next = node.nextSibling(); - - visitor.head(node, depth); // visit current node - if (parent != null && !node.hasParent()) { // removed or replaced - if (origSize == parent.childNodeSize()) { // replaced - node = - parent.childNode( - node.siblingIndex()); // replace ditches parent but keeps - // sibling index - } else { // removed - node = next; - if (node == null) { // last one, go up - node = parent; - depth--; - } - continue; // don't tail removed - } - } - - if (node.childNodeSize() > 0) { // descend - node = node.childNode(0); - depth++; - } else { - // when no more siblings, ascend - while (node.nextSibling() == null && depth > 0) { - visitor.tail(node, depth); - node = node.parentNode(); - depth--; - } - visitor.tail(node, depth); - if (node == root) break; - node = node.nextSibling(); - } - } - } - - private static void appendNormalisedText(final StringBuilder accum, final TextNode textNode) { - final String text = textNode.getWholeText(); - if (textNode instanceof CDataNode || preserveWhitespace(textNode.parent())) - accum.append(text); - else StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum)); - } - - @Contract("null -> false") - static boolean preserveWhitespace(Node node) { - if (node == null) return false; - // looks only at this element and five levels up, to prevent recursion & - // needless stack searches - if (node instanceof Element) { - Element 
el = (Element) node; - int i = 0; - do { - if (el.tag().preserveWhitespace()) return true; - el = el.parent(); - i++; - } while (i < 6 && el != null); - } - return false; - } + String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; + String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; + String NO_TEXT_PARAM_NAME = "textextractor.no.text"; + String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; - static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; - } + String text(Object element); } diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index 80ca74ed..27021730 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -244,6 +244,7 @@ config: detect.mimetype: true detect.charset.maxlength: 10000 + #textextractor.class: "org.apache.stormcrawler.parse.JSoupTextExtractor" textextractor.skip.after: -1 # filters URLs in sitemaps based on their modified Date (if any) diff --git a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java similarity index 80% rename from core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java rename to core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java index a5b00662..288cdc8b 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java @@ -27,13 +27,13 @@ import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; -class TextExtractorTest { +class JSoupTextExtractorTest { @Test void testMainContent() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); - TextExtractor extractor = new TextExtractor(conf); + 
conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); @@ -44,8 +44,8 @@ class TextExtractorTest { @Test void testExclusion() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<style>main</style>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -55,8 +55,8 @@ class TextExtractorTest { @Test void testExclusionCase() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "style"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<STYLE>main</STYLE>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -68,14 +68,14 @@ class TextExtractorTest { Config conf = new Config(); List<String> listinc = new LinkedList<>(); listinc.add("ARTICLE"); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, listinc); + conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, listinc); List<String> listex = new LinkedList<>(); listex.add("STYLE"); listex.add("SCRIPT"); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, listex); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, listex); // set a limit - conf.put(TextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); - TextExtractor 
extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String filename = "longtext.html"; Document jsoupDoc = Jsoup.parse( diff --git a/external/ai/README.md b/external/ai/README.md new file mode 100644 index 00000000..60bf4b9f --- /dev/null +++ b/external/ai/README.md @@ -0,0 +1,85 @@ +# stormcrawler-aws +================================ + +The `OpenAiTextExtractor` is a StormCrawler-compatible content extraction component that uses a Large Language Model (LLM) via an OpenAI-compatible API to extract meaningful text from HTML documents. +This enables context-aware and semantically rich extraction beyond traditional rule-based approaches. + +## Prerequisites + +Add `stormcrawler-ai` to the dependencies of your project\: + +```xml +<dependency> + <groupId>org.apache.stormcrawler</groupId> + <artifactId>stormcrawler-ai</artifactId> + <version>XXXX</version> +</dependency> +``` +In addition, you either need access to an OpenAI-API compatible model serving hoster or provide an inference service yourself. + +## Features + +- Uses OpenAI-compatible LLMs (e.g., LLaMA 3) for intelligent HTML parsing and content extraction. +- Customizable prompts for both system and user messages. +- Easily integrates with StormCrawler parsing pipelines. +- Optional listener interface for logging or usage metrics. + +## Configuration + +To use the `LlmTextExtractor`, your configuration file must include the following: + +```yaml +# Required: specify the extractor class +textextractor.class: "org.apache.stormcrawler.ai.OpenAITextExtractor" + +# Required: LLM API settings +textextractor.llm.api_key: "<your-api-key>" +textextractor.llm.url: "https://<your-openai-compatible-endpoint>" +textextractor.llm.model: "<your-model-to-use>" + +# Optional: system prompt sent to LLM +textextractor.system.prompt: "You are an expert in extracting content from plain HTML input." 
+ +# Optional: user prompt template (with placeholders) for custom use cases. Note: We provide a default prompt in `src/main/resources/llm-default-prompt.txt` +textextractor.llm.prompt: | + Please extract the main textual content from the following HTML: + {HTML} + + {REQUEST} + +# Optional: extra request passed into the user prompt +textextractor.llm.user_request: "Only include body content relevant to articles." + +# Optional: listener class implementing LlmResponseListener to hook into success/failure of LLM response, i.e. for tracking usage metrics. +textextractor.llm.listener.clazz: "<your-listener-class>" +``` + +Note: You **must** set `textextractor.class` to use this extractor in a StormCrawler topology. + +The `LlmTextExtractor` does not support the following configuration options from the default `TextExtractor`: + +- `textextractor.include.pattern` +- `textextractor.exclude.tags` +- `textextractor.no.text` +- `textextractor.skip.after` + +## Additional Notes +- **LLM Costs:** Calls to LLM APIs may incur costs - monitor usage if billing is a concern. In addition, certain providers might impose **rate limits**, which are not (yet) handled by our implementation as it is vendor specific behaviour. +- **Performance:** LLM responses add latency to a crawl; this extractor is best used for high-value pages or specific use-cases. + +## Developer Notes + +For local testing, you can use a locally running instance of [Ollama](https://github.com/ollama/ollama), which provides an OpenAI-compatible API interface. + +### Setup Instructions + +- **Start Ollama** locally (either directly on your system or via Docker). +- **Configure the following properties** in your application: + +```yaml + textextractor.llm.api_key: "" + textextractor.llm.url: "http://localhost:11434/v1" + textextractor.llm.model: "your-model-to-test" +``` + +- Ensure that the specified model (e.g., llama2, mistral, etc.) is already downloaded and ready for use in your local Ollama instance. 
\ No newline at end of file diff --git a/external/ai/ai-conf.yaml b/external/ai/ai-conf.yaml new file mode 100644 index 00000000..404e3a62 --- /dev/null +++ b/external/ai/ai-conf.yaml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################### +# AI Configuration # +############################### + +textextractor.llm.api_key: "" +textextractor.llm.url: "" +textextractor.llm.model: "" + +# Allows to define a custom system prompt. +#textextractor.system.prompt: "You are an expert in extracting content from plain HTML input." + +# Allows to define a custom prompt. {HTML} is replaced with the page html, {REQUEST} is replaced with the content of textextractor.llm.user_request +#textextractor.llm.prompt: "see llm-default-prompt.txt - can be a multi line string with placeholders" + +# Allows to configure a special user request which the LLM should honour. +#textextractor.llm.user_request: "-" + +# Allows to define the listener class to have the possibility to hook in usage metrics (i.e. 
for payment related metrics) +#textextractor.llm.listener.clazz: "org.apache.stormcrawler.ai.listener.NoOpListener" \ No newline at end of file diff --git a/external/ai/pom.xml b/external/ai/pom.xml new file mode 100644 index 00000000..b595e2c6 --- /dev/null +++ b/external/ai/pom.xml @@ -0,0 +1,64 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.stormcrawler</groupId> + <artifactId>stormcrawler-external</artifactId> + <version>3.3.1-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + + <artifactId>stormcrawler-ai</artifactId> + <name>stormcrawler-ai</name> + + <url>https://github.com/apache/stormcrawler/tree/master/external/ai</url> + <description>AI resources for StormCrawler</description> + + <properties> + <langchain4j.version>1.0.1</langchain4j.version> + <langchain4j.openai.version>1.0.1</langchain4j.openai.version> + </properties> + + <dependencies> + <dependency> + <groupId>dev.langchain4j</groupId> + <artifactId>langchain4j</artifactId> + <version>${langchain4j.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>dev.langchain4j</groupId> + <artifactId>langchain4j-open-ai</artifactId> + <version>${langchain4j.openai.version}</version> + </dependency> + </dependencies> + +</project> \ No newline at end of file diff --git a/external/ai/src/main/java/org/apache/stormcrawler/ai/AbstractLLMTextExtractor.java b/external/ai/src/main/java/org/apache/stormcrawler/ai/AbstractLLMTextExtractor.java new file mode 100644 index 00000000..3fc360cd --- /dev/null +++ b/external/ai/src/main/java/org/apache/stormcrawler/ai/AbstractLLMTextExtractor.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.ai; + +import dev.langchain4j.data.message.SystemMessage; +import dev.langchain4j.data.message.UserMessage; +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.chat.request.ChatRequest; +import dev.langchain4j.model.chat.response.ChatResponse; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.InvocationTargetException; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import org.apache.stormcrawler.ai.listener.LlmResponseListener; +import org.apache.stormcrawler.ai.listener.NoOpListener; +import org.apache.stormcrawler.parse.TextExtractor; +import org.apache.stormcrawler.util.ConfUtils; +import org.jsoup.nodes.Element; + +/** + * Abstract base class for LLM-based text extractors that use a {@link ChatModel} to convert HTML + * content into plain text. This class handles prompt preparation, configuration parsing, and model + * interaction. + * + * <p>Subclasses must implement {@link #getChatModel(Map)} to provide the appropriate chat model + * instance. 
+ */ +public abstract class AbstractLLMTextExtractor implements TextExtractor { + + public static final String API_KEY = "textextractor.llm.api_key"; + public static final String BASE_URL = "textextractor.llm.url"; + public static final String MODEL_NAME = "textextractor.llm.model"; + public static final String SYSTEM_PROMPT = "textextractor.system.prompt"; + public static final String USER_PROMPT = "textextractor.llm.prompt"; + public static final String USER_REQUEST = "textextractor.llm.user_request"; + public static final String LISTENER_CLASS = "textextractor.llm.listener.clazz"; + + private final ChatModel model; + private final SystemMessage systemMessage; + private final String userMessage; + private final String userRequest; + private final LlmResponseListener listener; + + /** + * Constructs the extractor using the given configuration. Initializes the chat model, + * system/user prompts, user request, and listener. + * + * @param stormConf configuration map with model and prompt settings + */ + public AbstractLLMTextExtractor(Map<String, Object> stormConf) { + this.model = getChatModel(stormConf); + this.systemMessage = + SystemMessage.from( + ConfUtils.getString( + stormConf, + SYSTEM_PROMPT, + "You are an expert in extracting content from plain HTML input.")); + this.userMessage = + ConfUtils.getString( + stormConf, USER_PROMPT, readFromClasspath("llm-default-prompt.txt")); + this.userRequest = ConfUtils.getString(stormConf, USER_REQUEST, ""); + final String clazz = + ConfUtils.getString(stormConf, LISTENER_CLASS, NoOpListener.class.getName()); + try { + listener = + (LlmResponseListener) + Class.forName(clazz).getDeclaredConstructor().newInstance(); + } catch (ClassNotFoundException + | InvocationTargetException + | InstantiationException + | IllegalAccessException + | NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + + /** + * Implemented by subclasses to return a specific {@link ChatModel} instance. 
+ * + * @param stormConf the configuration map + * @return the chat model to be used + */ + protected abstract ChatModel getChatModel(Map<String, Object> stormConf); + + /** + * Reads a file from the classpath and returns its content as a UTF-8 string. + * + * @param resource the name of the resource to read + * @return the content of the resource file + * @throws RuntimeException if the resource is not found or cannot be read + */ + protected String readFromClasspath(String resource) { + try { + final ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); + try (InputStream is = classLoader.getResourceAsStream(resource)) { + if (is == null) { + throw new FileNotFoundException("Resource not found: " + resource); + } + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Extracts text from a given JSoup {@link Element} by sending a prompt to the LLM model. + * + * @param element an {@link Element} representing a portion of HTML + * @return the LLM-extracted plain text or an empty string on failure + */ + @Override + public String text(Object element) { + if (element instanceof Element e) { + try { + final ChatRequest chatRequest = + ChatRequest.builder() + .messages( + systemMessage, + UserMessage.from( + replacePlaceholders(userMessage, e.html()))) + .build(); + final ChatResponse response = model.chat(chatRequest); + listener.onResponse(response); + return response.aiMessage().text(); + } catch (RuntimeException ex) { + listener.onFailure(element, ex); + } + } + return ""; + } + + /** + * Replaces placeholders in the user message template with the actual HTML content and user + * request. 
+ * + * @param userMessage the original user message template + * @param html the HTML string to insert + * @return the updated user message string with placeholders replaced + */ + protected String replacePlaceholders(String userMessage, String html) { + userMessage = userMessage.replace("{HTML}", html); + userMessage = userMessage.replace("{REQUEST}", userRequest); + return userMessage; + } +} diff --git a/external/ai/src/main/java/org/apache/stormcrawler/ai/OpenAITextExtractor.java b/external/ai/src/main/java/org/apache/stormcrawler/ai/OpenAITextExtractor.java new file mode 100644 index 00000000..4a212910 --- /dev/null +++ b/external/ai/src/main/java/org/apache/stormcrawler/ai/OpenAITextExtractor.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stormcrawler.ai; + +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.openai.OpenAiChatModel; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import org.apache.storm.Config; +import org.apache.stormcrawler.parse.TextExtractor; +import org.apache.stormcrawler.util.ConfUtils; +import org.jsoup.parser.Parser; + +/** + * A concrete implementation of {@link AbstractLLMTextExtractor} that uses the OpenAI-compatible + * model configuration to extract meaningful text from HTML content. + * + * <p>This class retrieves its configuration from a Storm configuration map and initializes an + * {@link OpenAiChatModel} accordingly. It can be used as a command-line tool to extract text from a + * given HTML file. + */ +public class OpenAITextExtractor extends AbstractLLMTextExtractor implements TextExtractor { + + /** + * Constructs a new {@code OpenAITextExtractor} using the provided Storm configuration. + * + * @param stormConf a map containing the configuration parameters + */ + public OpenAITextExtractor(Map<String, Object> stormConf) { + super(stormConf); + } + + /** + * Builds and returns an {@link OpenAiChatModel} using parameters from the configuration. + * + * @param stormConf the configuration map containing model parameters + * @return an initialized {@link ChatModel} implementation + */ + @Override + protected ChatModel getChatModel(Map<String, Object> stormConf) { + return OpenAiChatModel.builder() + .apiKey(ConfUtils.getString(stormConf, API_KEY)) + .baseUrl(ConfUtils.getString(stormConf, BASE_URL)) + .modelName(ConfUtils.getString(stormConf, MODEL_NAME)) + .build(); + } + + /** + * Command-line entry point for testing the extractor locally. 
+ * + * <p>Usage: {@code java OpenAITextExtractor config.yaml input.html} + * + * @param args command-line arguments: path to the config file and the HTML file + * @throws IOException if there is an error reading the configuration or HTML file + */ + public static void main(String[] args) throws IOException { + final Map<String, Object> conf = ConfUtils.loadConf(args[0], new Config()); + final OpenAITextExtractor textExtractor = new OpenAITextExtractor(conf); + + final String html = Files.readString(Path.of(args[1]), StandardCharsets.UTF_8); + + final String text = textExtractor.text(Parser.htmlParser().parseInput(html, "").body()); + + System.out.println(text); + } +} diff --git a/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/LlmResponseListener.java b/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/LlmResponseListener.java new file mode 100644 index 00000000..b0f0d98a --- /dev/null +++ b/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/LlmResponseListener.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.ai.listener; + +/** + * A listener interface for handling responses and failures from LLM-based operations. 
+ * + * <p>This interface allows implementers to process successful responses or handle errors that occur + * during interaction with a large language model (LLM). + */ +public interface LlmResponseListener { + + /** + * Invoked when a response is successfully received from the LLM. + * + * @param o the response object (typically a {@link + * dev.langchain4j.model.chat.response.ChatResponse}) + */ + void onResponse(Object o); + + /** + * Invoked when an error occurs during LLM interaction. + * + * @param context an optional context object related to the failure, may be {@code null} + * @param e the exception that was thrown + */ + void onFailure(Object context, Exception e); +} diff --git a/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/NoOpListener.java b/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/NoOpListener.java new file mode 100644 index 00000000..e063f654 --- /dev/null +++ b/external/ai/src/main/java/org/apache/stormcrawler/ai/listener/NoOpListener.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.ai.listener; + +/** + * A no-op implementation of {@link LlmResponseListener} that ignores all responses and failures. 
+ * + * <p>This is useful as a default listener when no specific behavior is needed for handling + * LLM outputs or errors. + */ +public class NoOpListener implements LlmResponseListener { + + /** + * Does nothing when a response is received. + * + * @param o the response object (ignored) + */ + @Override + public void onResponse(Object o) { + // No-op + } + + /** + * Does nothing when a failure occurs. + * + * @param context the context of the failure (ignored) + * @param e the exception thrown (ignored) + */ + @Override + public void onFailure(Object context, Exception e) { + // No-op + } +} \ No newline at end of file diff --git a/external/ai/src/main/resources/llm-default-prompt.txt b/external/ai/src/main/resources/llm-default-prompt.txt new file mode 100644 index 00000000..8ce4604b --- /dev/null +++ b/external/ai/src/main/resources/llm-default-prompt.txt @@ -0,0 +1,44 @@ +Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems. + +TASK DETAILS: +1. Content Selection +- DO: Keep essential information, main content, key details +- DO: Preserve hierarchical structure using markdown headers +- DO: Keep code blocks, tables, key lists +- DON'T: Include navigation menus, ads, footers, cookie notices +- DON'T: Keep social media widgets, sidebars, related content + +2. Content Transformation +- DO: Use proper markdown syntax (#, ##, **, `, etc) +- DO: Convert tables to markdown tables +- DO: Preserve code formatting with ```language blocks +- DO: Maintain link texts but remove tracking parameters +- DON'T: Include HTML tags in output +- DON'T: Keep class names, ids, or other HTML attributes + +3. 
Content Organization +- DO: Maintain logical flow of information +- DO: Group related content under appropriate headers +- DO: Use consistent header levels +- DON'T: Fragment related content +- DON'T: Duplicate information + +IMPORTANT: If a user-specific instruction is provided, ignore the general guidelines above and prioritize those requirements instead. + +OUTPUT FORMAT: +Wrap your response in <content> tags. Use proper markdown throughout. +<content> +[Your markdown content here] +</content> + +Begin filtering now. + +-------------------------------------------- + +<|HTML_CONTENT_START|> +{HTML} +<|HTML_CONTENT_END|> + +<|USER_INSTRUCTION_START|> +{REQUEST} +<|USER_INSTRUCTION_END|> \ No newline at end of file diff --git a/external/ai/src/test/java/org/apache/stormcrawler/ai/OpenAITextExtractorTest.java b/external/ai/src/test/java/org/apache/stormcrawler/ai/OpenAITextExtractorTest.java new file mode 100644 index 00000000..f65ea9e2 --- /dev/null +++ b/external/ai/src/test/java/org/apache/stormcrawler/ai/OpenAITextExtractorTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.stormcrawler.ai; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.regex.Pattern; +import org.apache.storm.Config; +import org.apache.stormcrawler.parse.TextExtractor; +import org.jsoup.parser.Parser; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +@EnabledIfEnvironmentVariable( + named = "LLM_AVAILABLE", + matches = ".*", + disabledReason = "To run this test, there needs to be a running LLM instance available.") +public class OpenAITextExtractorTest { + + @Test + void testExtraction() throws URISyntaxException, IOException { + final Config conf = new Config(); + conf.put(OpenAITextExtractor.API_KEY, System.getProperty("OPENAI_API_KEY", "")); + conf.put( + OpenAITextExtractor.BASE_URL, + System.getProperty("OPENAI_API_BASE_URL", "http://localhost:11434/v1")); + conf.put( + OpenAITextExtractor.MODEL_NAME, + System.getProperty("OPENAI_API_MODEL_NAME", "gemma3:latest")); + + final TextExtractor textExtractor = new OpenAITextExtractor(conf); + + final String html = + Files.readString( + Path.of( + Thread.currentThread() + .getContextClassLoader() + .getResource("stormcrawler.html") + .toURI()), + StandardCharsets.UTF_8); + assertNotNull(html); + + final String text = textExtractor.text(Parser.htmlParser().parseInput(html, "").body()); + + final Pattern tagPattern = + Pattern.compile( + "<\\s*/?\\s*(div|script|html|body|footer)\\b[^>]*>", + Pattern.CASE_INSENSITIVE); + assertFalse( + tagPattern.matcher(text).find(), + "Extracted text should not contain disallowed HTML tags"); + } +} diff --git a/external/ai/src/test/resources/stormcrawler.html b/external/ai/src/test/resources/stormcrawler.html new file mode 100644 index 
00000000..43b85886 --- /dev/null +++ b/external/ai/src/test/resources/stormcrawler.html @@ -0,0 +1,118 @@ + +<!DOCTYPE html> +<html> + +<head> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + + <title>Apache StormCrawler</title> + <meta name="description" content="Apache StormCrawler is collection of resources for building low-latency, scalable web crawlers on Apache Storm +"> + + <link rel="stylesheet" href="/css/main.css"> + <link rel="canonical" href="https://stormcrawler.apache.org/"> + <link rel="alternate" type="application/rss+xml" title="Apache StormCrawler" href="https://stormcrawler.apache.org/feed.xml"> + <link rel="icon" type="/image/png" href="/img/favicon.png" /> +</head> + + +<body class="home"> + +<header class="site-header"> + <div class="site-header__wrap"> + <div class="site-header__logo"> + <a href="/"><img src="/img/logo-small.png" alt="Apache StormCrawler"></a> + </div> + </div> +</header> +<nav class="site-nav"> + <ul> + <li><a href="/index.html">Home</a> + <li><a href="/download/index.html">Download</a> + <li><a href="/getting-started/">Getting Started</a></li> + <li><a href="/contribute/">Contribute</a></li> + <li><a href="https://javadoc.io/doc/org.apache.stormcrawler/stormcrawler-core/3.3.0/index.html">JavaDocs</a> + <li><a href="/faq/">FAQ</a></li> + <li><a href="/support/">Support</a></li> + </ul> +</nav> +<span id="forkongithub"><a href="https://github.com/apache/incubator-stormcrawler">Fork me on GitHub</a></span> + + +<main class="main-content"> + <div class="page-title"> + <h1>A collection of resources for building low-latency, scalable web crawlers on Apache Storm®</h1> + </div> + </div> + <div class="row row-col"> + <p><strong>Apache StormCrawler</strong> is an open source SDK for building distributed web crawlers based on <a href="http://storm.apache.org">Apache Storm®</a>. 
The project is under Apache License v2 and consists of a collection of reusable resources and components, written mostly in Java.</p> + <p>The aim of Apache StormCrawler is to help build web crawlers that are :</p> + <ul> + <li>scalable</li> + <li>resilient</li> + <li>low latency</li> + <li>easy to extend</li> + <li>polite yet efficient</li> + </ul> + <p><strong>Apache StormCrawler</strong> is a library and collection of resources that developers can leverage to build their own crawlers. The good news is that doing so can be pretty straightforward! Have a look at the <a href="getting-started/">Getting Started</a> section for more details.</p> + <p>Apart from the core components, we provide some <a href="https://github.com/apache/incubator-stormcrawler/tree/main/external">external resources</a> that you can reuse in your project, like for instance our spout and bolts for <a href="https://opensearch.org/">OpenSearch®</a> or a ParserBolt which uses <a href="http://tika.apache.org">Apache Tika®</a> to parse various document formats.</p> + <p><strong>Apache StormCrawler</strong> is perfectly suited to use cases where the URL to fetch and parse come as streams but is also an appropriate solution for large scale recursive crawls, particularly where low latency is required. 
The project is used in production by <a href="https://github.com/apache/incubator-stormcrawler/wiki/Powered-By">many organisations</a> and is actively developed and maintained.</p> + <p>The <a href="https://github.com/apache/incubator-stormcrawler/wiki/Presentations">Presentations</a> page contains links to some recent presentations made about this project.</p> + </div> + + <div class="row row-col"> + <div class="used-by-panel"> + <h2>Used by</h2> + <a href="https://pixray.com/" target="_blank"> + <img src="/img/pixray.png" alt="Pixray" height=80> + </a> + <a href="https://www.gov.nt.ca/" target="_blank"> + <img src="/img/gnwt.png" alt="Government of Northwest Territories"> + </a> + <a href="https://www.stolencamerafinder.com/" target="_blank"> + <img src="/img/stolen-camera-finder.png" alt="StolenCameraFinder"> + </a> + <a href="https://www.polecat.com/" target="_blank"> + <img src="/img/polecat.svg" alt="Polecat" height=70> + </a> + <br> + <a href="http://github.com/apache/incubator-stormcrawler/wiki/Powered-By">and many more...</a> + </div> + </div> + +</main> + +<footer class="site-footer"> + © 2025 <a href="https://www.apache.org/">The Apache Software Foundation</a><br/><br/> + Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/> Apache StormCrawler, StormCrawler, the Apache feather logo are trademarks of The Apache Software Foundation. <br/> All other marks mentioned may be trademarks or registered trademarks of their respective owners. 
<br/><br/> + <a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy Policy</a> | <a href="https://www.apache.org/security/">Security</a> | <a href="https://www.apache.org/foundation/sponsorship">Sponsorship</a> | <a href="https://www.apache.org/foundation/sponsors">Sponsors</a><br/><br/> + <div class="footer-widget"> + <a class="acevent" data-format="wide" data-mode="dark"></a> + </div> +</footer> + + +</body> + +<script src="https://www.apachecon.com/event-images/snippet.js"></script> + +<!-- Matomo --> +<script> + var _paq = window._paq = window._paq || []; + /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ + _paq.push(["setDoNotTrack", true]); + _paq.push(["disableCookies"]); + _paq.push(['trackPageView']); + _paq.push(['enableLinkTracking']); + (function() { + var u="https://analytics.apache.org/"; + _paq.push(['setTrackerUrl', u+'matomo.php']); + _paq.push(['setSiteId', '58']); + var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; + g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); + })(); +</script> +<!-- End Matomo Code --> +</html> diff --git a/external/pom.xml b/external/pom.xml index 21c6fe44..377d46bf 100644 --- a/external/pom.xml +++ b/external/pom.xml @@ -32,7 +32,7 @@ under the License. <artifactId>stormcrawler-external</artifactId> <packaging>pom</packaging> - <dependencies> + <dependencies> <dependency> <groupId>org.apache.storm</groupId> <artifactId>storm-client</artifactId> diff --git a/pom.xml b/pom.xml index af855c75..4c2f55b7 100644 --- a/pom.xml +++ b/pom.xml @@ -497,6 +497,7 @@ under the License. <exclude>**/README.md</exclude> <exclude>**/target/**</exclude> <exclude>**/warc.inputs</exclude> + <exclude>**/llm-default-prompt.txt</exclude> <exclude>LICENSE</exclude> <exclude>NOTICE</exclude> <exclude>DISCLAIMER</exclude> @@ -593,6 +594,12 @@ under the License. 
<version>${jackson.version}</version> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>${jackson.version}</version> + </dependency> + <!-- https://mvnrepository.com/artifact/org.jetbrains/annotations --> <dependency> <groupId>org.jetbrains</groupId> @@ -615,6 +622,7 @@ under the License. <modules> <module>core</module> <module>external</module> + <module>external/ai</module> <module>external/aws</module> <module>external/langid</module> <module>external/opensearch</module>
