I'm so new to Nutch that I wasn't sure yet how to tie the feature into a configuration file, but here's a first-pass, hardcoded version that seems to do OK, at least on the perfectly clean data that I've been feeding it. It probably blows up if someone forgets their closing <!--/htdig_noindex--> tag: the flag is a static field that is only cleared by that closing comment, so text after an unclosed tag would keep being skipped. I'd definitely like to see this feature get into the mainline Nutch codebase, but I am not yet familiar enough with Nutch to submit a credible patch.
-Jeff ============================= --- DOMContentUtils.java~ 2005-10-01 12:01:36.000000000 -0700 +++ DOMContentUtils.java 2005-12-27 08:13:00.000000000 -0800 @@ -25,6 +25,7 @@ import org.w3c.dom.*; + /** * A collection of methods for extracting content from DOM trees. * @@ -34,6 +35,8 @@ */ public class DOMContentUtils { + private static boolean htdig_noindex_mode = false; + public static class LinkParams { public String elName; public String attrName; @@ -101,6 +104,8 @@ private static final boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) { + + if ("script".equalsIgnoreCase(node.getNodeName())) { return false; } @@ -113,6 +118,11 @@ return true; } if (node.getNodeType() == Node.COMMENT_NODE) { + String text = node.getNodeValue(); + if (text.equals("htdig_noindex")) + htdig_noindex_mode = true; + if (text.equals("/htdig_noindex")) + htdig_noindex_mode = false; return false; } if (node.getNodeType() == Node.TEXT_NODE) { @@ -120,7 +130,7 @@ String text = node.getNodeValue(); text = text.replaceAll("\\s+", " "); text = text.trim(); - if (text.length() > 0) { + if (text.length() > 0 && htdig_noindex_mode == false) { if (sb.length() > 0) sb.append(' '); sb.append(text); }