This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch genai-text-extractor in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit 4734256c62fec54236e16490cbc613fb1e5de7b6 Author: Richard Zowalal <[email protected]> AuthorDate: Thu Jun 12 15:17:15 2025 +0200 Introduces TextExtractor as interface. Renames previous TextExtractor to JSoupTextExtractor. Allow configuration of text extractor used via config properties. --- .../apache/stormcrawler/bolt/JSoupParserBolt.java | 25 ++- ...{TextExtractor.java => JSoupTextExtractor.java} | 65 +++--- .../apache/stormcrawler/parse/TextExtractor.java | 219 +-------------------- core/src/main/resources/crawler-default.yaml | 1 + ...ractorTest.java => JSoupTextExtractorTest.java} | 22 +-- 5 files changed, 72 insertions(+), 260 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 01d9d797..b6751914 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -21,6 +21,7 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; @@ -44,6 +45,7 @@ import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.parse.DocumentFragmentBuilder; import org.apache.stormcrawler.parse.JSoupFilter; import org.apache.stormcrawler.parse.JSoupFilters; +import org.apache.stormcrawler.parse.JSoupTextExtractor; import org.apache.stormcrawler.parse.Outlink; import org.apache.stormcrawler.parse.ParseData; import org.apache.stormcrawler.parse.ParseFilter; @@ -161,7 +163,28 @@ public class JSoupParserBolt extends StatusEmitterBolt { ignoreMetaRedirections = ConfUtils.getBoolean(conf, "jsoup.ignore.meta.redirections", false); - textExtractor = new TextExtractor(conf); + final String clazz = + ConfUtils.getString( + 
conf, "textextractor.class", JSoupTextExtractor.class.getName()); + try { + textExtractor = + (TextExtractor) + Class.forName(clazz) + .getDeclaredConstructor(Map.class) + .newInstance(conf); + } catch (ClassNotFoundException e) { + LOG.warn("Could not load configured textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } catch (NoSuchMethodException e) { + LOG.warn( + "Configured textextractor.class '{}' does not provide a Map argument constructor.", + clazz, + e); + throw new RuntimeException(e); + } catch (InvocationTargetException | InstantiationException | IllegalAccessException e) { + LOG.warn("Cannot instantiate textextractor.class '{}'.", clazz, e); + throw new RuntimeException(e); + } } @Override diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java similarity index 83% copy from core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java copy to core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java index 99fdd16c..55b9279f 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java @@ -20,6 +20,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.Contract; import org.jsoup.helper.Validate; import org.jsoup.select.NodeVisitor; * * @since 1.13 */ -public class TextExtractor { +public class JSoupTextExtractor implements TextExtractor { public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; @@ -65,53 +66,57 @@ public class TextExtractor { public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; private final List<String> 
inclusionPatterns; - private final HashSet<String> excludedTags; + private final Set<String> excludedTags; private final boolean noText; private final int maxTextSize; - public TextExtractor(Map<String, Object> stormConf) { + public JSoupTextExtractor(Map<String, Object> stormConf) { maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); + excludedTags = new HashSet<>(); ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); } - public String text(Element element) { + public String text(Object o) { // not interested in getting any text? if (noText) return ""; - final StringBuilder accum = new StringBuilder(); + if (o instanceof Element element) { - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); + final StringBuilder accum = new StringBuilder(); - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; + // no patterns at all - return the text from the whole document + if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { + text(element, accum); + } else { + Elements matches = new Elements(); + + for (String pattern : inclusionPatterns) { + matches = element.select(pattern); + if (!matches.isEmpty()) { + break; + } } - } - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } + // if nothing matches or no patterns were defined use the whole doc + if (matches.isEmpty()) { + matches.add(element); + } - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); + for (Element node : matches) { 
+ text(node, accum); + accum.append("\n"); + } } - } - return accum.toString().trim(); + return accum.toString().trim(); + } + return ""; } - private void _text(Node node, final StringBuilder accum) { + private void text(Node node, final StringBuilder accum) { traverse( new NodeVisitor() { @@ -157,8 +162,7 @@ public class TextExtractor { * @param visitor Node visitor. * @param root the root node point to traverse. */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { + private void traverse(NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { Validate.notNull(visitor, "null visitor in traverse method"); Validate.notNull(root, "null root node in traverse method"); Node node = root; @@ -219,8 +223,7 @@ public class TextExtractor { if (node == null) return false; // looks only at this element and five levels up, to prevent recursion & // needless stack searches - if (node instanceof Element) { - Element el = (Element) node; + if (node instanceof Element el) { int i = 0; do { if (el.tag().preserveWhitespace()) return true; @@ -232,6 +235,6 @@ public class TextExtractor { } static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; + return !sb.isEmpty() && sb.charAt(sb.length() - 1) == ' '; } } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java index 99fdd16c..c9678bd2 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java @@ -16,222 +16,7 @@ */ package org.apache.stormcrawler.parse; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.stormcrawler.util.ConfUtils; -import org.jetbrains.annotations.Contract; -import org.jsoup.helper.Validate; -import org.jsoup.internal.StringUtil; 
-import org.jsoup.nodes.CDataNode; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.Elements; -import org.jsoup.select.NodeVisitor; +public interface TextExtractor { -/** - * Filters the text extracted from HTML documents, used by JSoupParserBolt. Configured with optional - * inclusion patterns based on <a href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> - * JSoup selectors</a>, as well as a list of tags to be excluded. - * - * <p>Replaces ContentFilter. - * - * <p>The first matching inclusion pattern is used or the whole document if no expressions are - * configured or no match has been found. - * - * <p>The TextExtraction can be configured as so: - * - * <pre>{@code - * textextractor.include.pattern: - * - DIV[id="maincontent"] - * - DIV[itemprop="articleBody"] - * - ARTICLE - * - * textextractor.exclude.tags: - * - STYLE - * - SCRIPT - * - * }</pre> - * - * @since 1.13 - */ -public class TextExtractor { - - public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; - public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; - public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text"; - public static final String TEXT_MAX_TEXT_PARAM_NAME = "textextractor.skip.after"; - - private final List<String> inclusionPatterns; - private final HashSet<String> excludedTags; - private final boolean noText; - private final int maxTextSize; - - public TextExtractor(Map<String, Object> stormConf) { - maxTextSize = ConfUtils.getInt(stormConf, TEXT_MAX_TEXT_PARAM_NAME, -1); - noText = ConfUtils.getBoolean(stormConf, NO_TEXT_PARAM_NAME, false); - inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, stormConf); - excludedTags = new HashSet<String>(); - ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) - .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT))); - } - - public String text(Element element) { - // 
not interested in getting any text? - if (noText) return ""; - - final StringBuilder accum = new StringBuilder(); - - // no patterns at all - return the text from the whole document - if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { - _text(element, accum); - } else { - Elements matches = new Elements(); - - for (String pattern : inclusionPatterns) { - matches = element.select(pattern); - if (!matches.isEmpty()) { - break; - } - } - - // if nothing matches or no patterns were defined use the whole doc - if (matches.isEmpty()) { - matches.add(element); - } - - for (Element node : matches) { - _text(node, accum); - accum.append("\n"); - } - } - - return accum.toString().trim(); - } - - private void _text(Node node, final StringBuilder accum) { - traverse( - new NodeVisitor() { - - private Node excluded = null; - - public void head(Node node, int depth) { - if (excluded == null && node instanceof TextNode) { - TextNode textNode = (TextNode) node; - appendNormalisedText(accum, textNode); - } else if (node instanceof Element) { - Element element = (Element) node; - if (excludedTags.contains(element.tagName())) { - excluded = element; - } - if (accum.length() > 0 - && (element.isBlock() || element.tag().getName().equals("br")) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - - public void tail(Node node, int depth) { - // make sure there is a space between block tags and immediately - // following text nodes <div>One</div>Two should be "One Two". - if (node instanceof Element) { - Element element = (Element) node; - if (element == excluded) { - excluded = null; - } - if (element.isBlock() - && (node.nextSibling() instanceof TextNode) - && !lastCharIsWhitespace(accum)) accum.append(' '); - } - } - }, - node, - maxTextSize, - accum); - } - - /** - * Start a depth-first traverse of the root and all of its descendants. - * - * @param visitor Node visitor. - * @param root the root node point to traverse. 
- */ - public static void traverse( - NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) { - Validate.notNull(visitor, "null visitor in traverse method"); - Validate.notNull(root, "null root node in traverse method"); - Node node = root; - int depth = 0; - - while (node != null) { - // interrupts if too much text has already been produced - if (maxSize > 0 && builder.length() >= maxSize) return; - - Node parent = - node.parentNode(); // remember parent to find nodes that get replaced in .head - int origSize = parent != null ? parent.childNodeSize() : 0; - Node next = node.nextSibling(); - - visitor.head(node, depth); // visit current node - if (parent != null && !node.hasParent()) { // removed or replaced - if (origSize == parent.childNodeSize()) { // replaced - node = - parent.childNode( - node.siblingIndex()); // replace ditches parent but keeps - // sibling index - } else { // removed - node = next; - if (node == null) { // last one, go up - node = parent; - depth--; - } - continue; // don't tail removed - } - } - - if (node.childNodeSize() > 0) { // descend - node = node.childNode(0); - depth++; - } else { - // when no more siblings, ascend - while (node.nextSibling() == null && depth > 0) { - visitor.tail(node, depth); - node = node.parentNode(); - depth--; - } - visitor.tail(node, depth); - if (node == root) break; - node = node.nextSibling(); - } - } - } - - private static void appendNormalisedText(final StringBuilder accum, final TextNode textNode) { - final String text = textNode.getWholeText(); - if (textNode instanceof CDataNode || preserveWhitespace(textNode.parent())) - accum.append(text); - else StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum)); - } - - @Contract("null -> false") - static boolean preserveWhitespace(Node node) { - if (node == null) return false; - // looks only at this element and five levels up, to prevent recursion & - // needless stack searches - if (node instanceof Element) { - Element 
el = (Element) node; - int i = 0; - do { - if (el.tag().preserveWhitespace()) return true; - el = el.parent(); - i++; - } while (i < 6 && el != null); - } - return false; - } - - static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; - } + String text(Object element); } diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index 80ca74ed..27021730 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -244,6 +244,7 @@ config: detect.mimetype: true detect.charset.maxlength: 10000 + #textextractor.class: "org.apache.stormcrawler.parse.JSoupTextExtractor" textextractor.skip.after: -1 # filters URLs in sitemaps based on their modified Date (if any) diff --git a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java similarity index 80% rename from core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java rename to core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java index a5b00662..288cdc8b 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/TextExtractorTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java @@ -27,13 +27,13 @@ import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; -class TextExtractorTest { +class JSoupTextExtractorTest { @Test void testMainContent() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>"; 
Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); @@ -44,8 +44,8 @@ class TextExtractorTest { @Test void testExclusion() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<style>main</style>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -55,8 +55,8 @@ class TextExtractorTest { @Test void testExclusionCase() throws IOException { Config conf = new Config(); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style"); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, "style"); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String content = "<html>the<STYLE>main</STYLE>content of the page</html>"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net"); String text = extractor.text(jsoupDoc.body()); @@ -68,14 +68,14 @@ class TextExtractorTest { Config conf = new Config(); List<String> listinc = new LinkedList<>(); listinc.add("ARTICLE"); - conf.put(TextExtractor.INCLUDE_PARAM_NAME, listinc); + conf.put(JSoupTextExtractor.INCLUDE_PARAM_NAME, listinc); List<String> listex = new LinkedList<>(); listex.add("STYLE"); listex.add("SCRIPT"); - conf.put(TextExtractor.EXCLUDE_PARAM_NAME, listex); + conf.put(JSoupTextExtractor.EXCLUDE_PARAM_NAME, listex); // set a limit - conf.put(TextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); - TextExtractor extractor = new TextExtractor(conf); + conf.put(JSoupTextExtractor.TEXT_MAX_TEXT_PARAM_NAME, 5123900); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); String filename = "longtext.html"; Document jsoupDoc = Jsoup.parse(
