Author: kubes Date: Fri Mar 9 19:55:23 2007 New Revision: 516648 URL: http://svn.apache.org/viewvc?view=rev&rev=516648 Log: NUTCH-436 resolved. Fixed resolution of outlink URLs when the base URL carries param (i.e. ;xxxx) information. My problem with EOL characters on commit should be resolved.
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Mar 9 19:55:23 2007 @@ -1,353 +1,400 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.html; - -import java.net.URL; -import java.net.MalformedURLException; -import java.util.ArrayList; -import java.util.HashMap; - -import org.apache.nutch.parse.Outlink; -import org.apache.hadoop.conf.Configuration; - -import org.w3c.dom.*; - -/** - * A collection of methods for extracting content from DOM trees. 
- * - * This class holds a few utility methods for pulling content out of - * DOM nodes, such as getOutlinks, getText, etc. - * - */ -public class DOMContentUtils { - - public static class LinkParams { - public String elName; - public String attrName; - public int childLen; - - public LinkParams(String elName, String attrName, int childLen) { - this.elName = elName; - this.attrName = attrName; - this.childLen = childLen; - } - - public String toString() { - return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; - } - } - - private HashMap linkParams = new HashMap(); - private Configuration conf; - - - public DOMContentUtils(Configuration conf) { - setConf(conf); - } - - public void setConf(Configuration conf) { - this.conf = conf; - linkParams.clear(); - linkParams.put("a", new LinkParams("a", "href", 1)); - linkParams.put("area", new LinkParams("area", "href", 0)); - if (conf.getBoolean("parser.html.form.use_action", false)) { - linkParams.put("form", new LinkParams("form", "action", 1)); - } - linkParams.put("frame", new LinkParams("frame", "src", 0)); - linkParams.put("iframe", new LinkParams("iframe", "src", 0)); - linkParams.put("script", new LinkParams("script", "src", 0)); - linkParams.put("link", new LinkParams("link", "href", 0)); - linkParams.put("img", new LinkParams("img", "src", 0)); - } - - /** - * This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL PROTECTED] Node}, - * and will append all the content text found beneath the DOM node to - * the <code>StringBuffer</code>. - * - * <p> - * - * If <code>abortOnNestedAnchors</code> is true, DOM traversal will - * be aborted and the <code>StringBuffer</code> will not contain - * any text encountered after a nested anchor is found. 
- * - * <p> - * - * @return true if nested anchors were found - */ - public boolean getText(StringBuffer sb, Node node, - boolean abortOnNestedAnchors) { - if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { - return true; - } - return false; - } - - - /** - * This is a convinience method, equivalent to [EMAIL PROTECTED] - * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. - * - */ - public void getText(StringBuffer sb, Node node) { - getText(sb, node, false); - } - - // returns true if abortOnNestedAnchors is true and we find nested - // anchors - private boolean getTextHelper(StringBuffer sb, Node node, - boolean abortOnNestedAnchors, - int anchorDepth) { - if ("script".equalsIgnoreCase(node.getNodeName())) { - return false; - } - if ("style".equalsIgnoreCase(node.getNodeName())) { - return false; - } - if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) { - anchorDepth++; - if (anchorDepth > 1) - return true; - } - if (node.getNodeType() == Node.COMMENT_NODE) { - return false; - } - if (node.getNodeType() == Node.TEXT_NODE) { - // cleanup and trim the value - String text = node.getNodeValue(); - text = text.replaceAll("\\s+", " "); - text = text.trim(); - if (text.length() > 0) { - if (sb.length() > 0) sb.append(' '); - sb.append(text); - } - } - boolean abort = false; - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++) { - if (getTextHelper(sb, children.item(i), - abortOnNestedAnchors, anchorDepth)) { - abort = true; - break; - } - } - } - return abort; - } - - /** - * This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL PROTECTED] Node}, - * and will append the content text found beneath the first - * <code>title</code> node to the <code>StringBuffer</code>. 
- * - * @return true if a title node was found, false otherwise - */ - public boolean getTitle(StringBuffer sb, Node node) { - if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD - return false; - - if (node.getNodeType() == Node.ELEMENT_NODE) { - if ("title".equalsIgnoreCase(node.getNodeName())) { - getText(sb, node); - return true; - } - } - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++) { - if (getTitle(sb, children.item(i))) { - return true; - } - } - } - return false; - } - - /** If Node contains a BASE tag then it's HREF is returned. */ - public URL getBase(Node node) { - - // is this node a BASE tag? - if (node.getNodeType() == Node.ELEMENT_NODE) { - - if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD - return null; - - - if ("base".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - for (int i= 0; i < attrs.getLength(); i++ ) { - Node attr = attrs.item(i); - if ("href".equalsIgnoreCase(attr.getNodeName())) { - try { - return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) {} - } - } - } - } - - // does it contain a base tag? - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++) { - URL base = getBase(children.item(i)); - if (base != null) - return base; - } - } - - // no. - return null; - } - - - private boolean hasOnlyWhiteSpace(Node node) { - String val= node.getNodeValue(); - for (int i= 0; i < val.length(); i++) { - if (!Character.isWhitespace(val.charAt(i))) - return false; - } - return true; - } - - // this only covers a few cases of empty links that are symptomatic - // of nekohtml's DOM-fixup process... 
- private boolean shouldThrowAwayLink(Node node, NodeList children, - int childLen, LinkParams params) { - if (childLen == 0) { - // this has no inner structure - if (params.childLen == 0) return false; - else return true; - } else if ((childLen == 1) - && (children.item(0).getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { - // single nested link - return true; - - } else if (childLen == 2) { - - Node c0= children.item(0); - Node c1= children.item(1); - - if ((c0.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c0.getNodeName())) - && (c1.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c1) ) { - // single link followed by whitespace node - return true; - } - - if ((c1.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) ) { - // whitespace node followed by single link - return true; - } - - } else if (childLen == 3) { - Node c0= children.item(0); - Node c1= children.item(1); - Node c2= children.item(2); - - if ((c1.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && (c2.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) - && hasOnlyWhiteSpace(c2) ) { - // single link surrounded by whitespace nodes - return true; - } - } - - return false; - } - - /** - * This method finds all anchors below the supplied DOM - * <code>node</code>, and creates appropriate [EMAIL PROTECTED] Outlink} - * records for each (relative to the supplied <code>base</code> - * URL), and adds them to the <code>outlinks</code> [EMAIL PROTECTED] - * ArrayList}. - * - * <p> - * - * Links without inner structure (tags, text, etc) are discarded, as - * are links which contain only single nested links and empty text - * nodes (this is a common DOM-fixup artifact, at least with - * nekohtml). 
- */ - public void getOutlinks(URL base, ArrayList outlinks, - Node node) { - - NodeList children = node.getChildNodes(); - int childLen= 0; - if (children != null) - childLen= children.getLength(); - - if (node.getNodeType() == Node.ELEMENT_NODE) { - String nodeName = node.getNodeName().toLowerCase(); - LinkParams params = (LinkParams)linkParams.get(nodeName); - if (params != null) { - if (!shouldThrowAwayLink(node, children, childLen, params)) { - - StringBuffer linkText = new StringBuffer(); - getText(linkText, node, true); - - NamedNodeMap attrs = node.getAttributes(); - String target = null; - boolean noFollow = false; - boolean post = false; - for (int i= 0; i < attrs.getLength(); i++ ) { - Node attr = attrs.item(i); - String attrName = attr.getNodeName(); - if (params.attrName.equalsIgnoreCase(attrName)) { - target = attr.getNodeValue(); - } else if ("rel".equalsIgnoreCase(attrName) && - "nofollow".equalsIgnoreCase(attr.getNodeValue())) { - noFollow = true; - } else if ("method".equalsIgnoreCase(attrName) && - "post".equalsIgnoreCase(attr.getNodeValue())) { - post = true; - } - } - if (target != null && !noFollow && !post) - try { - URL url = new URL(base, target); - outlinks.add(new Outlink(url.toString(), - linkText.toString().trim(), conf)); - } catch (MalformedURLException e) { - // don't care - } - } - // this should not have any children, skip them - if (params.childLen == 0) return; - } - } - for ( int i = 0; i < childLen; i++ ) { - getOutlinks(base, outlinks, children.item(i)); - } - } - -} - +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import java.net.URL; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.apache.nutch.parse.Outlink; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.*; + +/** + * A collection of methods for extracting content from DOM trees. + * + * This class holds a few utility methods for pulling content out of + * DOM nodes, such as getOutlinks, getText, etc. + * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + private Configuration conf; + + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", false)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + 
linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + } + + /** + * This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL PROTECTED] Node}, + * and will append all the content text found beneath the DOM node to + * the <code>StringBuffer</code>. + * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will + * be aborted and the <code>StringBuffer</code> will not contain + * any text encountered after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + + /** + * This is a convinience method, equivalent to [EMAIL PROTECTED] + * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, + int anchorDepth) { + if ("script".equalsIgnoreCase(node.getNodeName())) { + return false; + } + if ("style".equalsIgnoreCase(node.getNodeName())) { + return false; + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) { + anchorDepth++; + if (anchorDepth > 1) + return true; + } + if (node.getNodeType() == Node.COMMENT_NODE) { + return false; + } + if (node.getNodeType() == Node.TEXT_NODE) { + // cleanup and trim the value + String text = node.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) sb.append(' '); + sb.append(text); + } + } + boolean abort = false; + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) 
{ + if (getTextHelper(sb, children.item(i), + abortOnNestedAnchors, anchorDepth)) { + abort = true; + break; + } + } + } + return abort; + } + + /** + * This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL PROTECTED] Node}, + * and will append the content text found beneath the first + * <code>title</code> node to the <code>StringBuffer</code>. + * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD + return false; + + if (node.getNodeType() == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(node.getNodeName())) { + getText(sb, node); + return true; + } + } + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + if (getTitle(sb, children.item(i))) { + return true; + } + } + } + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + public URL getBase(Node node) { + + // is this node a BASE tag? + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD + return null; + + + if ("base".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) {} + } + } + } + } + + // does it contain a base tag? + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + URL base = getBase(children.item(i)); + if (base != null) + return base; + } + } + + // no. 
+ return null; + } + + + private boolean hasOnlyWhiteSpace(Node node) { + String val= node.getNodeValue(); + for (int i= 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... + private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) return false; + else return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0= children.item(0); + Node c1= children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c1) ) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) ) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0= children.item(0); + Node c1= children.item(1); + Node c2= children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2) ) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * Handles cases where the url param information is encoded into the base + * url as opposed to the target. + * <p> + * If the target contains params (i.e. 
';xxxx') information then the target + * params information is assumed to be correct and any base params information + * is ignored. If the base contains params information but the target does + * not, then the params information is moved to the target allowing it to be + * correctly determined by the java.net.URL class. + * + * @param base The base URL. + * @param target The target path from the base URL. + * + * @return URL A URL with the params information correctly encoded. + * + * @throws MalformedURLException If the url is not a well-formed URL. + */ + private URL fixEmbeddedParams(URL base, String target) + throws MalformedURLException{ + + // the target contains params information or the base doesn't then no + // conversion necessary, return regular URL + if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + return new URL(base, target); + } + + // get the base url and its params information + String baseURL = base.toString(); + int startParams = baseURL.indexOf(';'); + String params = baseURL.substring(startParams); + + // if the target has a query string then put the params information after + // any path but before the query string, otherwise just append to the path + int startQS = target.indexOf('?'); + if (startQS >= 0) { + target = target.substring(0, startQS) + params + + target.substring(startQS); + } + else { + target += params; + } + + return new URL(base, target); + } + + /** + * This method finds all anchors below the supplied DOM + * <code>node</code>, and creates appropriate {@link Outlink} + * records for each (relative to the supplied <code>base</code> + * URL), and adds them to the <code>outlinks</code> {@link + * ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as + * are links which contain only single nested links and empty text + * nodes (this is a common DOM-fixup artifact, at least with + * nekohtml). 
+ */ + public void getOutlinks(URL base, ArrayList outlinks, + Node node) { + + NodeList children = node.getChildNodes(); + int childLen= 0; + if (children != null) + childLen= children.getLength(); + + if (node.getNodeType() == Node.ELEMENT_NODE) { + String nodeName = node.getNodeName().toLowerCase(); + LinkParams params = (LinkParams)linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(node, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, node, true); + + NamedNodeMap attrs = node.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) && + "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) && + "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = (base.toString().indexOf(';') > 0) ? 
+ fixEmbeddedParams(base, target) : new URL(base, target); + outlinks.add(new Outlink(url.toString(), + linkText.toString().trim(), conf)); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) return; + } + } + for ( int i = 0; i < childLen; i++ ) { + getOutlinks(base, outlinks, children.item(i)); + } + } + +} + Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Mar 9 19:55:23 2007 @@ -1,376 +1,408 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.html; - -import junit.framework.TestCase; - -import org.apache.nutch.parse.Outlink; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.ByteArrayInputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; - -import org.cyberneko.html.parsers.*; -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; - -/** - * Unit tests for DOMContentUtils. - */ -public class TestDOMContentUtils extends TestCase { - - private static final String[] testPages= { - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" - + "</body></html>"), - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" - + " home </a><!--comment-->" - + "<style> style </style>" - + " <a href=\"bot.html\">" - + " bots </a>" - + "</body></html>"), - new String("<html><head><title> </title>" - + "</head><body> " - + "<a href=\"/\"> separate this " - + "<a href=\"ok\"> from this" - + "</a></a>" - + "</body></html>"), - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " - + "<ul>" - + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" - + "<li> <a href=\"2\"> 2" - + "</ul>" - + "</body></html>"), - // test frameset link extraction. The invalid frame in the middle will be - // fixed to a third standalone frame. 
- new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " - + "<frame src=\"top.html\">" - + "</frame>" - + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" - + "<frame src=\"invalid.html\"/>" - + "</frame>" - + "<frame src=\"right.html\">" - + "</frame>" - + "</frameset>" - + "</frameset>" - + "</body></html>"), - // test <area> and <iframe> link extraction + url normalization - new String("<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" - + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" - + "</body></html>"), - // test whitespace processing for plain text extraction - new String("<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." 
- + "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - }; - - private static int SKIP = 9; - - private static String[] testBaseHrefs= { - "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", - "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", - "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", - "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", - "http://www.nutch.org/", - "http://www.nutch.org/", - }; - - private static final DocumentFragment testDOMs[]= - new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs= - new URL[testPages.length]; - - - private static final String[] answerText= { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. 
" - + "End this madness ! . . . .", - "ignore ignore", - "test1 test2", - "test1 test2" - }; - - private static final String[] answerTitle= { - "title", - "title", - "", - "my title", - "my title", - "my title", - "my title", - "", - "", - "" - }; - - // note: should be in page-order - private static Outlink[][] answerOutlinks; - - private static Configuration conf; - private static DOMContentUtils utils = null; - - public TestDOMContentUtils(String name) { - super(name); - } - - private static void setup() { - conf = NutchConfiguration.create(); - conf.setBoolean("parser.html.form.use_action", true); - utils = new DOMContentUtils(conf); - DOMFragmentParser parser= new DOMFragmentParser(); - for (int i= 0; i < testPages.length; i++) { - DocumentFragment node= - new HTMLDocumentImpl().createDocumentFragment(); - try { - parser.parse( - new InputSource( - new ByteArrayInputStream(testPages[i].getBytes()) ), - node); - testBaseHrefURLs[i]= new URL(testBaseHrefs[i]); - } catch (Exception e) { - assertTrue("caught exception: " + e, false); - } - testDOMs[i]= node; - } - try { - answerOutlinks = new Outlink[][]{ - { - new Outlink("http://www.nutch.org", "anchor", conf), - }, - { - new Outlink("http://www.nutch.org/", "home", conf), - new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf), - }, - { - new Outlink("http://www.nutch.org/", "separate this", conf), - new Outlink("http://www.nutch.org/docs/ok", "from this", conf), - }, - { - new Outlink("http://www.nutch.org/", "home", conf), - new Outlink("http://www.nutch.org/docs/1", "1", conf), - new Outlink("http://www.nutch.org/docs/2", "2", conf), - }, - { - new Outlink("http://www.nutch.org/frames/top.html", "", conf), - new Outlink("http://www.nutch.org/frames/left.html", "", conf), - new Outlink("http://www.nutch.org/frames/invalid.html", "", conf), - new Outlink("http://www.nutch.org/frames/right.html", "", conf), - }, - { - new Outlink("http://www.nutch.org/maps/logo.gif", "", conf), - new 
Outlink("http://www.nutch.org/index.html", "", conf), - new Outlink("http://www.nutch.org/maps/#bottom", "", conf), - new Outlink("http://www.nutch.org/bot.html", "", conf), - new Outlink("http://www.nutch.org/docs/index.html", "", conf), - }, - { - new Outlink("http://www.nutch.org/index.html", "whitespace test", conf), - }, - { - }, - { - new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf), - }, - { - } - }; - - } catch (MalformedURLException e) { - - } - } - - private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1= new StringTokenizer(s1); - StringTokenizer st2= new StringTokenizer(s2); - - while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) - return false; - if ( ! st1.nextToken().equals(st2.nextToken()) ) - return false; - } - if (st2.hasMoreTokens()) - return false; - return true; - } - - public void testGetText() { - if (testDOMs[0] == null) - setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuffer sb= new StringBuffer(); - utils.getText(sb, testDOMs[i]); - String text= sb.toString(); - assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, - equalsIgnoreWhitespace(answerText[i], text)); - } - } - - public void testGetTitle() { - if (testDOMs[0] == null) - setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuffer sb= new StringBuffer(); - utils.getTitle(sb, testDOMs[i]); - String text= sb.toString(); - assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, - equalsIgnoreWhitespace(answerTitle[i], text)); - } - } - - public void testGetOutlinks() { - if (testDOMs[0] == null) - setup(); - for (int i= 0; i < testPages.length; i++) { - ArrayList outlinks= new ArrayList(); - if (i == SKIP) { - conf.setBoolean("parser.html.form.use_action", false); - utils.setConf(conf); - } else { - 
conf.setBoolean("parser.html.form.use_action", true); - utils.setConf(conf); - } - utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr= new Outlink[outlinks.size()]; - outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr); - compareOutlinks(answerOutlinks[i], outlinkArr); - } - } - - private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i= 0; i < o.length; i++) { - sb.append(o[i].toString()); - sb.append(System.getProperty("line.separator")); - } - } - - private static final String outlinksString(Outlink[] o) { - StringBuffer sb= new StringBuffer(); - appendOutlinks(sb, o); - return sb.toString(); - } - - private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { - if (o1.length != o2.length) { - assertTrue("got wrong number of outlinks (expecting " + o1.length - + ", got " + o2.length + ")" - + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) - + System.getProperty("line.separator"), - false - ); - } - - for (int i= 0; i < o1.length; i++) { - if (!o1[i].equals(o2[i])) { - assertTrue("got wrong outlinks at position " + i - + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + o1[i].toString() - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + o2[i].toString(), - false - ); - - } - } - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import junit.framework.TestCase; + +import org.apache.nutch.parse.Outlink; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.cyberneko.html.parsers.*; +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; + +/** + * Unit tests for DOMContentUtils. + */ +public class TestDOMContentUtils extends TestCase { + + private static final String[] testPages= { + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + + "</body></html>"), + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + + " home </a><!--comment-->" + + "<style> style </style>" + + " <a href=\"bot.html\">" + + " bots </a>" + + "</body></html>"), + new String("<html><head><title> </title>" + + "</head><body> " + + "<a href=\"/\"> separate this " + + "<a href=\"ok\"> from this" + + "</a></a>" + + "</body></html>"), + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... 
<li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + + "<ul>" + + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + + "<li> <a href=\"2\"> 2" + + "</ul>" + + "</body></html>"), + // test frameset link extraction. The invalid frame in the middle will be + // fixed to a third standalone frame. + new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + + "<frame src=\"top.html\">" + + "</frame>" + + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + + "<frame src=\"invalid.html\"/>" + + "</frame>" + + "<frame src=\"right.html\">" + + "</frame>" + + "</frameset>" + + "</frameset>" + + "</body></html>"), + // test <area> and <iframe> link extraction + url normalization + new String("<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + + "</body></html>"), + // test whitespace processing for plain text extraction + new String("<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." + + "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + + "</head><body>" + + "<a href=\";x\">anchor1</a>" + + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + + "</body></html>"), + new String("<html><head><title> title </title>" + + "</head><body>" + + "<a href=\"g\">anchor1</a>" + + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + + "</body></html>"), + }; + + private static int SKIP = 9; + + private static String[] testBaseHrefs= { + "http://www.nutch.org", + 
"http://www.nutch.org/docs/foo.html", + "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", + "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", + "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", + "http://www.nutch.org/", + "http://www.nutch.org/", + "http://www.nutch.org/", + "http://www.nutch.org/;something" + }; + + private static final DocumentFragment testDOMs[]= + new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs= + new URL[testPages.length]; + + + private static final String[] answerText= { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . 
.", + "ignore ignore", + "test1 test2", + "test1 test2", + "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" + }; + + private static final String[] answerTitle= { + "title", + "title", + "", + "my title", + "my title", + "my title", + "my title", + "", + "", + "", + "title", + "title" + }; + + // note: should be in page-order + private static Outlink[][] answerOutlinks; + + private static Configuration conf; + private static DOMContentUtils utils = null; + + public TestDOMContentUtils(String name) { + super(name); + } + + private static void setup() { + conf = NutchConfiguration.create(); + conf.setBoolean("parser.html.form.use_action", true); + utils = new DOMContentUtils(conf); + DOMFragmentParser parser= new DOMFragmentParser(); + for (int i= 0; i < testPages.length; i++) { + DocumentFragment node= + new HTMLDocumentImpl().createDocumentFragment(); + try { + parser.parse( + new InputSource( + new ByteArrayInputStream(testPages[i].getBytes()) ), + node); + testBaseHrefURLs[i]= new URL(testBaseHrefs[i]); + } catch (Exception e) { + assertTrue("caught exception: " + e, false); + } + testDOMs[i]= node; + } + try { + answerOutlinks = new Outlink[][]{ + { + new Outlink("http://www.nutch.org", "anchor", conf), + }, + { + new Outlink("http://www.nutch.org/", "home", conf), + new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf), + }, + { + new Outlink("http://www.nutch.org/", "separate this", conf), + new Outlink("http://www.nutch.org/docs/ok", "from this", conf), + }, + { + new Outlink("http://www.nutch.org/", "home", conf), + new Outlink("http://www.nutch.org/docs/1", "1", conf), + new Outlink("http://www.nutch.org/docs/2", "2", conf), + }, + { + new Outlink("http://www.nutch.org/frames/top.html", "", conf), + new Outlink("http://www.nutch.org/frames/left.html", "", conf), + new Outlink("http://www.nutch.org/frames/invalid.html", "", conf), + new Outlink("http://www.nutch.org/frames/right.html", "", conf), + }, + { + new 
Outlink("http://www.nutch.org/maps/logo.gif", "", conf), + new Outlink("http://www.nutch.org/index.html", "", conf), + new Outlink("http://www.nutch.org/maps/#bottom", "", conf), + new Outlink("http://www.nutch.org/bot.html", "", conf), + new Outlink("http://www.nutch.org/docs/index.html", "", conf), + }, + { + new Outlink("http://www.nutch.org/index.html", "whitespace test", conf), + }, + { + }, + { + new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf), + }, + { + }, + { + new Outlink("http://www.nutch.org/;x", "anchor1", conf), + new Outlink("http://www.nutch.org/g;x", "anchor2", conf), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf) + }, + { + new Outlink("http://www.nutch.org/g;something", "anchor1", conf), + new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf), + new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf) + } + }; + + } catch (MalformedURLException e) { + + } + } + + private static boolean equalsIgnoreWhitespace(String s1, String s2) { + StringTokenizer st1= new StringTokenizer(s1); + StringTokenizer st2= new StringTokenizer(s2); + + while (st1.hasMoreTokens()) { + if (!st2.hasMoreTokens()) + return false; + if ( ! 
st1.nextToken().equals(st2.nextToken()) ) + return false; + } + if (st2.hasMoreTokens()) + return false; + return true; + } + + public void testGetText() { + if (testDOMs[0] == null) + setup(); + for (int i= 0; i < testPages.length; i++) { + StringBuffer sb= new StringBuffer(); + utils.getText(sb, testDOMs[i]); + String text= sb.toString(); + assertTrue("expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + + "got text: "+ text, + equalsIgnoreWhitespace(answerText[i], text)); + } + } + + public void testGetTitle() { + if (testDOMs[0] == null) + setup(); + for (int i= 0; i < testPages.length; i++) { + StringBuffer sb= new StringBuffer(); + utils.getTitle(sb, testDOMs[i]); + String text= sb.toString(); + assertTrue("expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + + "got text: "+ text, + equalsIgnoreWhitespace(answerTitle[i], text)); + } + } + + public void testGetOutlinks() { + if (testDOMs[0] == null) + setup(); + for (int i= 0; i < testPages.length; i++) { + ArrayList outlinks= new ArrayList(); + if (i == SKIP) { + conf.setBoolean("parser.html.form.use_action", false); + utils.setConf(conf); + } else { + conf.setBoolean("parser.html.form.use_action", true); + utils.setConf(conf); + } + utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); + Outlink[] outlinkArr= new Outlink[outlinks.size()]; + outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr); + compareOutlinks(answerOutlinks[i], outlinkArr); + } + } + + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { + for (int i= 0; i < o.length; i++) { + sb.append(o[i].toString()); + sb.append(System.getProperty("line.separator")); + } + } + + private static final String outlinksString(Outlink[] o) { + StringBuffer sb= new StringBuffer(); + appendOutlinks(sb, o); + return sb.toString(); + } + + private static final void compareOutlinks(Outlink[] o1, Outlink[] 
o2) { + if (o1.length != o2.length) { + assertTrue("got wrong number of outlinks (expecting " + o1.length + + ", got " + o2.length + ")" + + System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + outlinksString(o1) + + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + + System.getProperty("line.separator"), + false + ); + } + + for (int i= 0; i < o1.length; i++) { + if (!o1[i].equals(o2[i])) { + assertTrue("got wrong outlinks at position " + i + + System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + o1[i].toString() + + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + o2[i].toString(), + false + ); + + } + } + } +} ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs