Author: olegk
Date: Fri Nov 14 11:53:13 2008
New Revision: 714116
URL: http://svn.apache.org/viewvc?rev=714116&view=rev
Log:
Use SAX parser / LinkExtractor instead of DOM fragment parser for HTML parsing
in Droids Core
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java Fri Nov 14 11:53:13 2008
@@ -86,10 +86,10 @@
}
switch (result) {
case WARN:
- log.warn(ex.getMessage());
+ log.warn(ex.toString());
break;
case FATAL:
- log.warn(ex.getMessage());
+ log.warn(ex.getMessage(), ex);
terminated = true;
break;
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Fri Nov 14 11:53:13 2008
@@ -18,31 +18,21 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.ArrayList;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Map;
+import org.apache.droids.ParseData;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
import org.apache.droids.exception.ContentFormatViolationException;
import org.apache.droids.exception.DroidsException;
-import org.apache.droids.exception.InvalidLinkException;
import org.apache.droids.helper.Loggable;
-import org.apache.droids.LinkTask;
-import org.apache.droids.ParseData;
import org.apache.droids.parse.ParseImpl;
-import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
@@ -76,36 +66,24 @@
this.base = newLink.getURI();
ParseData parseData = null;
// setup filter chain
- final XMLDocumentFilter[] filters = { getRemover() };
+ XMLDocumentFilter[] filters = { getRemover() };
// create HTML parser
- final DOMFragmentParser parser = getParser(filters);
- final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
- // parse document
- // XMLInputSource source = new XMLInputSource(null, uri, uri);
+ SAXParser parser = getParser(filters);
+ LinkExtractor linkExtractor = new LinkExtractor(newLink, elements);
+ parser.setContentHandler(linkExtractor);
InputStream instream = entity.obtainContent();
try {
- parser.parse(new InputSource(instream), node);
+ parser.parse(new InputSource(instream));
} catch (SAXException ex) {
throw new ContentFormatViolationException("Failure parsing HTML content", ex);
} finally {
instream.close();
}
- parseData = extract(node);
- return new ParseImpl(newLink.getId(), parseData);
+ return new ParseImpl(newLink.getId(), new ParseData(linkExtractor.getLinks()));
}
- private ParseData extract(DocumentFragment node) throws InvalidLinkException {
- final ArrayList<Link> links = new ArrayList<Link>();
- try {
- extractLinks(node, links, new HashSet<URI>());
- } catch (URISyntaxException ex) {
- throw new InvalidLinkException("Invalid URI: " + ex.getInput(), ex);
- }
- return new ParseData(links);
- }
-
- private DOMFragmentParser getParser(XMLDocumentFilter[] filters) {
- final DOMFragmentParser parser = new DOMFragmentParser();
+ private SAXParser getParser(XMLDocumentFilter[] filters) {
+ SAXParser parser = new SAXParser();
try {
parser.setProperty("http://cyberneko.org/html/properties/filters",
filters);
parser.setFeature(
@@ -138,49 +116,4 @@
return remover;
}
- private void extractLinks(Node node, ArrayList<Link> links,
- HashSet<URI> set) throws URISyntaxException {
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- String nodeName = node.getNodeName().toLowerCase();
- if (elements.containsKey(nodeName)) {
- String value = elements.get(nodeName);
- NamedNodeMap attrs = node.getAttributes();
- for (int i = 0; i < attrs.getLength(); i++) {
- Node attr = attrs.item(i);
- String attrName = attr.getNodeName();
- if (attrName.equalsIgnoreCase(value)) {
- String ref = attr.getNodeValue();
- URI newUri = null;
- if(ref.startsWith("/")){
- newUri = new URI(
- base.getScheme(), base.getUserInfo(), base.getHost(), base.getPort(),
- ref, null, null);
- }else if(!ref.toLowerCase().startsWith("javascript")){
- newUri = base.resolve(new URI(ref));
- }
- if (newUri != null) {
- // Link from, URI uri, int depth, String text
- final LinkTask outlink = new LinkTask( link, newUri, link.getDepth()+1 );
- if (log.isDebugEnabled()) {
- log.debug("set size: "+set.size());
- log.debug("outlink.getToUrl(): "+outlink.getURI());
- log.debug("set.contains(outlink.getToUrl(): " + set.contains(newUri));
- }
- if (!set.contains(newUri)) {
- set.add(newUri);
- links.add(outlink);
- }
- }
- }
- }
- }
- }
- final NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++) {
- extractLinks(children.item(i), links, set);
- }
- }
- }
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java Fri Nov 14 11:53:13 2008
@@ -37,19 +37,19 @@
protected final Log log = LogFactory.getLog(this.getClass());
/**
- * List of links
+ * Base url for host reference
*/
- private Collection<Link> links = new ArrayList<Link>();
+ private final Link base;
/**
* Map with the pair label-attribute for the accepted items
*/
- private Map<String, String> elements;
+ private final Map<String, String> elements;
/**
- * Base url for host reference
+ * List of links
*/
- private Link base = null;
+ private Collection<Link> links = new ArrayList<Link>();
/**
* Set of URIs visited yet
@@ -61,6 +61,12 @@
*/
private URI link = null;
+ public LinkExtractor(Link base, Map<String, String> elements) {
+ super();
+ this.base = base;
+ this.elements = elements;
+ }
+
@Override
public void startDocument() throws SAXException {
history = new HashSet<String>();
@@ -113,10 +119,6 @@
}
}
- public void setBase(Link base) {
- this.base = base;
- }
-
public Collection<Link> getLinks() {
return links;
}
@@ -125,10 +127,6 @@
return elements;
}
- public void setElements(Map<String, String> elements) {
- this.elements = elements;
- }
-
/**
* Transform a String into an URI.
* @param target the URI in String format.
Modified:
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java (original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java Fri Nov 14 11:53:13 2008
@@ -13,6 +13,7 @@
import org.apache.droids.helper.factories.ParserFactory;
import org.apache.droids.helper.factories.ProtocolFactory;
import org.apache.droids.helper.factories.URLFiltersFactory;
+import org.apache.droids.impl.DefaultTaskExceptionHandler;
import org.apache.droids.impl.SequentialTaskMaster;
import org.apache.droids.impl.SimpleTaskQueue;
import org.apache.droids.parse.html.HtmlParser;
@@ -80,6 +81,7 @@
SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
taskMaster.setDelayTimer( simpleDelayTimer );
+ taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());
CrawlingDroid crawler = new CrawlingDroid( simpleQueue, taskMaster );
crawler.setFiltersFactory(filtersFactory);
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Fri Nov 14 11:53:13 2008
@@ -18,6 +18,8 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
import org.apache.droids.ParseData;
import org.apache.droids.api.ContentEntity;
@@ -40,19 +42,19 @@
private org.apache.tika.parser.Parser parser = null;
- private LinkExtractor extractor = null;
+ private Map<String, String> elements= null;
-
- public LinkExtractor getExtractor() {
- return extractor;
+ public Map<String, String> getElements() {
+ if (elements == null) {
+ elements = new HashMap<String, String>();
+ }
+ return elements;
}
-
- public void setExtractor(LinkExtractor extractor) {
- this.extractor = extractor;
+ public void setElements(Map<String, String> elements) {
+ this.elements = elements;
}
-
public Parse getParse(ContentEntity entity, Link link) throws IOException,
DroidsException {
// Init Tika objects
parser = new AutoDetectParser();
@@ -63,7 +65,7 @@
charset = "UTF-8";
}
EchoHandler data = new EchoHandler(charset);
- extractor.setBase(link);
+ LinkExtractor extractor = new LinkExtractor(link, elements);
TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);