Author: olegk
Date: Thu Nov 13 01:47:55 2008
New Revision: 713691
URL: http://svn.apache.org/viewvc?rev=713691&view=rev
Log:
API changes:
* Added ContentEntity interface representing a body of content retrieved from a
URI
* Replaced default HTTP protocol implementation based on URLConnection with one
based on HttpClient 4.0
Added:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
Removed:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/net/UrlHelper.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/HttpBase.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/MediaType.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
Modified:
incubator/droids/trunk/droids-core/pom.xml
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java
Modified: incubator/droids/trunk/droids-core/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/pom.xml?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/pom.xml (original)
+++ incubator/droids/trunk/droids-core/pom.xml Thu Nov 13 01:47:55 2008
@@ -52,10 +52,6 @@
<artifactId>commons-logging</artifactId>
</dependency>
<dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- </dependency>
- <dependency>
<groupId>org.apache.geronimo.specs</groupId>
<artifactId>geronimo-stax-api_1.0_spec</artifactId>
</dependency>
Added:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java?rev=713691&view=auto
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
(added)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.api;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Abstract interface representing a body of content with a particular
+ * MIME type and an optional charset.
+ * <p>
+ * IMPORTANT: implementations of this interface MUST ensure that the content
+ * is repeatable, that is, the content can be consumed more than once.
+ * <p>
+ * IMPORTANT: The consumer of the entity content MUST close the input stream
+ * returned by {@link #obtainContent()} when finished reading the content.
+ * The consumer MUST call {@link #finish()} when the entity is no longer
+ * needed in order to release underlying resources held by the entity.
+ *
+ * @version 1.0
+ */
+public interface ContentEntity {
+
+ /**
+ * Returns content of the entity as an input stream. This input stream
+ * MUST be closed by the consumer when finished reading content.
+ * <p/>
+ * IMPORTANT: This method MUST return a new instance of {@link InputStream}
+ * to ensure the content can be consumed more than once.
+ *
+ * @return input stream
+ * @throws IOException
+ */
+ InputStream obtainContent() throws IOException;
+
+ /**
+ * Returns MIME type of the entity.
+ *
+ * @return MIME type
+ */
+ String getMimeType();
+
+ /**
+ * Returns charset of the entity if known. Otherwise returns
+ * <code>null</code>.
+ *
+ * @return charset
+ */
+ String getCharset();
+
+ /**
+ * Release all underlying resources held by the entity.
+ */
+ void finish();
+
+}
\ No newline at end of file
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
Thu Nov 13 01:47:55 2008
@@ -17,7 +17,6 @@
package org.apache.droids.api;
import java.io.IOException;
-import java.io.InputStream;
import org.apache.droids.exception.DroidsException;
@@ -38,5 +37,5 @@
* the link that correspond to the stream
* @return the parse object
*/
- Parse getParse(InputStream openStream, Link link) throws DroidsException,
IOException;
+ Parse getParse(ContentEntity entity, Link link) throws DroidsException,
IOException;
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
Thu Nov 13 01:47:55 2008
@@ -17,8 +17,8 @@
package org.apache.droids.api;
import java.io.IOException;
-import java.io.InputStream;
import java.net.MalformedURLException;
+import java.net.URI;
/**
* The protocol interface is a wrapper to hide the underlying implementation of
@@ -37,24 +37,16 @@
* @return true if we can request the url. false if we are forbidden.
* @throws MalformedURLException
*/
- boolean isAllowed(String url) throws MalformedURLException;
+ boolean isAllowed(URI url) throws IOException;
/**
- * Return the stream represent of the url
+ * Return the content entity representing the url
*
* @param url
* url of the stream we want to open
- * @return the stream of the given url
+ * @return the content of the given url
* @throws IOException
*/
- InputStream openStream(String url) throws IOException;
+ ContentEntity load(URI uri) throws IOException;
- /**
- * Returns the content type of the url
- *
- * @param url
- * url to evaluate
- * @return registered content type
- */
- String getContentType(String url);
}
\ No newline at end of file
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
Thu Nov 13 01:47:55 2008
@@ -16,6 +16,8 @@
*/
package org.apache.droids.exception;
+import java.net.URI;
+
/**
* If we do not have any instance of a protocol registered for the iven url.
*
@@ -24,7 +26,7 @@
*/
public class ProtocolNotFoundException extends DroidsException {
private static final long serialVersionUID = 6980937469875896426L;
- private String url = null;
+ private URI uri = null;
/**
* Create an exception for the given url
@@ -32,8 +34,8 @@
* @param url
* url where we do not have a suitable protocol
*/
- public ProtocolNotFoundException(String url) {
- this(url, "protocol not found for url=" + url);
+ public ProtocolNotFoundException(URI uri) {
+ this(uri, "protocol not found for uri=" + uri);
}
/**
@@ -44,9 +46,9 @@
* @param message
* detailed message to explain the underlying cause
*/
- public ProtocolNotFoundException(String url, String message) {
+ public ProtocolNotFoundException(URI uri, String message) {
super(message);
- this.url = url;
+ this.uri = uri;
}
/**
@@ -54,7 +56,7 @@
*
* @return url which has caused the problem
*/
- public String getUrl() {
- return url;
+ public URI getUri() {
+ return uri;
}
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
Thu Nov 13 01:47:55 2008
@@ -16,13 +16,11 @@
*/
package org.apache.droids.helper.factories;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
-import org.apache.commons.io.IOUtils;
+import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Handler;
import org.apache.droids.api.Parse;
import org.apache.droids.exception.DroidsException;
@@ -47,21 +45,15 @@
* the underlying parse object
* @return false if we found a problem, true if all went well
*/
- public boolean handle(InputStream stream, URI uri, Parse parse)
+ public boolean handle(ContentEntity entity, URI uri, Parse parse)
throws DroidsException, IOException {
- byte[] streamCopy = null;
- if(stream==null){
- return false;
- }
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- IOUtils.copy(stream, out);
- streamCopy = out.toByteArray();
-
for (Handler handler : getMap().values()) {
- if (streamCopy == null) {
- return false;
+ InputStream instream = entity.obtainContent();
+ try {
+ handler.handle(instream, uri, parse);
+ } finally {
+ instream.close();
}
- handler.handle(new ByteArrayInputStream(streamCopy), uri, parse);
}
return true;
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
Thu Nov 13 01:47:55 2008
@@ -16,8 +16,7 @@
*/
package org.apache.droids.helper.factories;
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.URI;
import org.apache.droids.api.Protocol;
import org.apache.droids.exception.ProtocolNotFoundException;
@@ -38,13 +37,10 @@
* @return ready to use protocol plugin or null if non have been found
* @throws ProtocolNotFoundException
*/
- public Protocol getProtocol(String uri)
- throws MalformedURLException, ProtocolNotFoundException {
- URL url = null;
+ public Protocol getProtocol(URI uri) throws ProtocolNotFoundException {
Protocol protocol = null;
try {
- url = new URL(uri);
- String protocolName = url.getProtocol();
+ String protocolName = uri.getScheme();
if (protocolName == null) {
throw new ProtocolNotFoundException(uri);
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
Thu Nov 13 01:47:55 2008
@@ -24,6 +24,7 @@
import java.util.HashSet;
import java.util.Map;
+import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
@@ -42,6 +43,7 @@
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
@@ -69,7 +71,7 @@
private Link link = null;
- public Parse getParse(InputStream stream, Link newLink) throws
DroidsException, IOException {
+ public Parse getParse(ContentEntity entity, Link newLink) throws
DroidsException, IOException {
this.link = newLink;
this.base = newLink.getURI();
ParseData parseData = null;
@@ -80,13 +82,16 @@
final DocumentFragment node = new
HTMLDocumentImpl().createDocumentFragment();
// parse document
// XMLInputSource source = new XMLInputSource(null, uri, uri);
+ InputStream instream = entity.obtainContent();
try {
- parser.parse(base.toString(), node);
+ parser.parse(new InputSource(instream), node);
} catch (SAXException ex) {
throw new ContentFormatViolationException("Failure parsing HTML
content", ex);
+ } finally {
+ instream.close();
}
parseData = extract(node);
- return new ParseImpl(stream.toString(), parseData);
+ return new ParseImpl(newLink.getId(), parseData);
}
private ParseData extract(DocumentFragment node) throws InvalidLinkException
{
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
Thu Nov 13 01:47:55 2008
@@ -5,35 +5,71 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.net.MalformedURLException;
+import java.net.URI;
+import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Protocol;
public class FileProtocol implements Protocol {
- FileInputStream fileInputStream=null;
- public String getContentType(String url) {
- // FIXME: to be implemented
- return null;
- }
+ FileInputStream fileInputStream=null;
- public boolean isAllowed(String url) throws MalformedURLException {
- File file = new File(extractLocation(url));
+ public boolean isAllowed(URI uri) {
+ File file = new File(extractLocation(uri));
return file.canRead();
}
- public InputStream openStream(String url) throws IOException {
- url = extractLocation(url);
- fileInputStream = new FileInputStream(url);
- return new BufferedInputStream(fileInputStream);
+ public ContentEntity load(URI uri) throws IOException {
+ File file = new File(extractLocation(uri));
+ return new FileContentEntity(file);
}
- private String extractLocation(String url) {
- final int start = url.indexOf("://");
+ private String extractLocation(URI uri) {
+ String location = uri.toString();
+ final int start = location.indexOf("://");
if(start>-1){
- url = url.substring(start+3);
+ location = location.substring(start+3);
}
- return url;
+ return location;
}
+ static class FileContentEntity implements ContentEntity {
+
+ private final File file;
+ private final String mimeType;
+ private final String charset;
+
+ public FileContentEntity(File file) throws IOException {
+ super();
+ this.file = file;
+ String s = file.getName().toLowerCase();
+ if (s.endsWith(".html") || s.endsWith(".htm")) {
+ this.mimeType = "text/html";
+ this.charset = "ISO-8859-1";
+ } else if (s.endsWith(".txt")) {
+ this.mimeType = "text/plain";
+ this.charset = "ISO-8859-1";
+ } else {
+ this.mimeType = "binary/octet-stream";
+ this.charset = null;
+ }
+ }
+
+ public InputStream obtainContent() throws IOException {
+ return new BufferedInputStream(new FileInputStream(file));
+ }
+
+ public void finish() {
+ }
+
+ public String getMimeType() {
+ return mimeType;
+ }
+
+ public String getCharset() {
+ return charset;
+ }
+
+ }
+
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
Thu Nov 13 01:47:55 2008
@@ -26,7 +26,10 @@
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.RedirectHandler;
import org.apache.http.client.UserTokenHandler;
+import org.apache.http.client.params.AuthPolicy;
+import org.apache.http.client.protocol.ClientContext;
import org.apache.http.client.protocol.RequestDefaultHeaders;
+import org.apache.http.client.protocol.RequestProxyAuthentication;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.ConnectionKeepAliveStrategy;
import org.apache.http.conn.routing.HttpRoutePlanner;
@@ -36,10 +39,13 @@
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.cookie.CookieSpecRegistry;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
+import org.apache.http.impl.auth.BasicSchemeFactory;
+import org.apache.http.impl.auth.DigestSchemeFactory;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
+import org.apache.http.impl.client.DefaultProxyAuthenticationHandler;
import org.apache.http.impl.client.DefaultRedirectHandler;
import org.apache.http.impl.client.DefaultUserTokenHandler;
import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
@@ -95,6 +101,8 @@
httpproc.addInterceptor(new RequestConnControl());
httpproc.addInterceptor(new RequestUserAgent());
httpproc.addInterceptor(new RequestExpectContinue());
+ // HTTP authentication interceptors
+ httpproc.addInterceptor(new RequestProxyAuthentication());
return httpproc;
}
@@ -122,8 +130,14 @@
@Override
protected AuthSchemeRegistry createAuthSchemeRegistry()
{
- // Return empty auth scheme registry. There'll be no auth support
- return new AuthSchemeRegistry();
+ AuthSchemeRegistry registry = new AuthSchemeRegistry();
+ registry.register(
+ AuthPolicy.BASIC,
+ new BasicSchemeFactory());
+ registry.register(
+ AuthPolicy.DIGEST,
+ new DigestSchemeFactory());
+ return registry;
}
@Override
@@ -149,6 +163,12 @@
protected HttpContext createHttpContext()
{
HttpContext context = new BasicHttpContext();
+ context.setAttribute(
+ ClientContext.AUTHSCHEME_REGISTRY,
+ getAuthSchemes());
+ context.setAttribute(
+ ClientContext.CREDS_PROVIDER,
+ getCredentialsProvider());
return context;
}
@@ -167,7 +187,7 @@
@Override
protected AuthenticationHandler createProxyAuthenticationHandler()
{
- return new NoAuthHandler();
+ return new DefaultProxyAuthenticationHandler();
}
@Override
Added:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java?rev=713691&view=auto
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
(added)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,81 @@
+/*
+ * ====================================================================
+ *
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+import org.apache.droids.norobots.ContentLoader;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpHead;
+
+/**
+ * {@link ContentLoader} based on HttpClient 4.0.
+ */
+public class HttpClientContentLoader implements ContentLoader
+{
+
+ private final HttpClient httpclient;
+
+ public HttpClientContentLoader(HttpClient httpclient) {
+ super();
+ this.httpclient = httpclient;
+ }
+
+ public boolean exists(URI uri) throws IOException
+ {
+ HttpHead httphead = new HttpHead(uri);
+ HttpResponse response = httpclient.execute(httphead);
+ return response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
+ }
+
+ public InputStream load(URI uri) throws IOException {
+ HttpGet httpget = new HttpGet(uri);
+ HttpResponse response = httpclient.execute(httpget);
+ StatusLine statusline = response.getStatusLine();
+ if (statusline.getStatusCode() == HttpStatus.SC_NOT_FOUND) {
+ return null;
+ }
+ if (statusline.getStatusCode() != HttpStatus.SC_OK) {
+ throw new HttpResponseException(
+ statusline.getStatusCode(), statusline.getReasonPhrase());
+ }
+ HttpEntity entity = response.getEntity();
+ if (entity != null) {
+ return entity.getContent();
+ } else {
+ return null;
+ }
+ }
+
+}
Added:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java?rev=713691&view=auto
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
(added)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HttpEntity;
+import org.apache.http.NameValuePair;
+import org.apache.http.entity.BufferedHttpEntity;
+import org.apache.http.protocol.HTTP;
+
+public class HttpContentEntity implements ContentEntity {
+
+ private final HttpEntity entity;
+ private final String mimeType;
+ private final String charset;
+
+ public HttpContentEntity(HttpEntity entity) throws IOException {
+ super();
+ if (entity.isRepeatable()) {
+ this.entity = entity;
+ } else {
+ this.entity = new BufferedHttpEntity(entity);
+ }
+
+ String mimeType = null;
+ String charset = null;
+ Header header = entity.getContentType();
+ if (header != null) {
+ HeaderElement[] helems = header.getElements();
+ if (helems != null && helems.length > 0) {
+ HeaderElement helem = helems[0];
+ mimeType = helem.getName();
+ NameValuePair nvp = helem.getParameterByName("charset");
+ if (nvp != null) {
+ charset = nvp.getValue();
+ }
+ }
+ }
+ if (mimeType != null) {
+ this.mimeType = mimeType.toLowerCase(Locale.ENGLISH);
+ } else {
+ this.mimeType = "binary/octet-stream";
+ }
+ if (charset != null) {
+ this.charset = charset;
+ } else {
+ if (this.mimeType.startsWith("text/")) {
+ this.charset = HTTP.ISO_8859_1;
+ } else {
+ this.charset = null;
+ }
+ }
+ }
+
+ public String getMimeType() {
+ return mimeType;
+ }
+
+ public String getCharset() {
+ return charset;
+ }
+
+ public InputStream obtainContent() throws IOException {
+ return entity.getContent();
+ }
+
+ public void finish() {
+ }
+
+}
Added:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=713691&view=auto
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
(added)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Protocol;
+import org.apache.droids.helper.Loggable;
+import org.apache.droids.norobots.ContentLoader;
+import org.apache.droids.norobots.NoRobotClient;
+import org.apache.droids.norobots.NoRobotException;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.params.CoreProtocolPNames;
+
+/**
+ * Protocol handler based on HttpClient 4.0.
+ */
+public class HttpProtocol extends Loggable implements Protocol {
+
+ private final HttpClient httpclient;
+ private final ContentLoader contentLoader;
+
+ private boolean forceAllow = false;
+ private String userAgent = "Apache-Droids/1.1 (java 1.5)";
+
+ public HttpProtocol(final HttpClient httpclient) {
+ super();
+ this.httpclient = httpclient;
+ this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT,
userAgent);
+ this.contentLoader = new HttpClientContentLoader(httpclient);
+ }
+
+ public HttpProtocol() {
+ this(new DefaultHttpClient());
+ }
+
+ public ContentEntity load(URI uri) throws IOException {
+ HttpGet httpget = new HttpGet(uri);
+ HttpResponse response = httpclient.execute(httpget);
+ StatusLine statusline = response.getStatusLine();
+ if (statusline.getStatusCode() != HttpStatus.SC_OK) {
+ throw new HttpResponseException(
+ statusline.getStatusCode(), statusline.getReasonPhrase());
+ }
+ HttpEntity entity = response.getEntity();
+ if (entity == null) {
+ // Should _almost_ never happen with HTTP GET requests.
+ throw new ClientProtocolException("Empty entity");
+ }
+ return new HttpContentEntity(entity);
+ }
+
+ public boolean isAllowed(URI uri) {
+ if (forceAllow) {
+ return forceAllow;
+ }
+
+ String path = uri.getPath();
+ int i = path.lastIndexOf('/');
+ if (i != -1) {
+ path = path.substring(0, i);
+ } else {
+ path = path + "/";
+ }
+
+ URI baseURI;
+ try {
+ baseURI = new URI(
+ uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(),
+ path, null, null);
+ } catch (URISyntaxException ex) {
+ log.error("Unable to determine base URI for " + uri);
+ return false;
+ }
+
+ NoRobotClient nrc = new NoRobotClient(contentLoader, userAgent);
+ try {
+ nrc.parse(baseURI);
+ } catch (NoRobotException ex) {
+ log.error("Failure parsing robots.txt: " + ex.getMessage());
+ return false;
+ } catch (IOException ex) {
+ log.error("I/O error parsing robots.txt: " + ex.getMessage());
+ return false;
+ }
+ boolean test = nrc.isUrlAllowed(uri);
+ String message = (test) ? "allowed" : "denied";
+ if (log.isInfoEnabled()) {
+ log.info("Url is " + message);
+ }
+ return test;
+ }
+
+ public String getUserAgent() {
+ return userAgent;
+ }
+
+ public void setUserAgent(String userAgent) {
+ this.userAgent = userAgent;
+ this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT,
userAgent);
+ }
+
+ /**
+ * You can force that a site is allowed (ignoring the robots.txt). This should
+ * only be used on server that you control and where you have the permission
+ * to ignore the robots.txt.
+ *
+ * @return <code>true</code> if you are rude and ignore robots.txt.
+ * <code>false</code> if you are playing nice.
+ */
+ public boolean isForceAllow() {
+ return forceAllow;
+ }
+
+ /**
+ * You can force that a site is allowed (ignoring the robots.txt). This should
+ * only be used on server that you control and where you have the permission
+ * to ignore the robots.txt.
+ *
+ * @param forceAllow
+ * if you want to force an allow and ignore the robots.txt set
+ * to <code>true</code>. If you want to obey the rules and
+ * be polite set to <code>false</code>.
+ */
+ public void setForceAllow(boolean forceAllow) {
+ this.forceAllow = forceAllow;
+ }
+
+}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
Thu Nov 13 01:47:55 2008
@@ -17,11 +17,12 @@
package org.apache.droids.robot.crawler;
import java.io.IOException;
-import java.io.InputStream;
+import java.net.URI;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
+import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
@@ -42,21 +43,20 @@
public void execute(Link link) throws DroidsException, IOException
{
- InputStream openStream = null;
final String userAgent = this.getClass().getCanonicalName();
if (log.isInfoEnabled()) {
log.info("Starting " + userAgent);
}
- try {
- String url = link.getId();
- final Protocol protocol = droid.getProtocolFactory().getProtocol(url);
- openStream = protocol.openStream(url);
- if (protocol.isAllowed(url)) {
- String contentType = protocol.getContentType(url);
+ URI uri = link.getURI();
+ final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
+ if (protocol.isAllowed(uri)) {
+ ContentEntity entity = protocol.load(uri);
+ try {
+ String contentType = entity.getMimeType();
if (log.isInfoEnabled()) {
log.info("contentType " + contentType);
}
- if (contentType==null){
+ if (contentType == null){
log.info("missing contentType... can't parse...");
}
else {
@@ -67,35 +67,28 @@
}
}
else {
- Parse parse = parser.getParse(openStream, link);
+ Parse parse = parser.getParse(entity, link);
if( parse.getData() != null ) {
Collection<Link> outlinks = getFilteredOutlinks( parse );
droid.getQueue().merge( outlinks );
}
- handle( parse, openStream, link );
+ handle( parse, entity, link );
}
}
- }
- else {
- log.info("Stopping processing since"
- + " bots are not allowed for this url.");
+ } finally {
+ entity.finish();
}
}
- finally{
- try {
- if (openStream != null) {
- openStream.close();
- }
- } catch (IOException ex) {
- log.error("Error closing stream", ex);
- }
+ else {
+ log.info("Stopping processing since"
+ + " bots are not allowed for this url.");
}
}
- protected void handle( Parse parse, InputStream openStream, Link link )
+ protected void handle( Parse parse, ContentEntity entity, Link link )
throws DroidsException, IOException
{
- droid.getHandlerFactory().handle(openStream, link.getURI(), parse);
+ droid.getHandlerFactory().handle(entity, link.getURI(), parse);
}
protected Collection<Link> getFilteredOutlinks( Parse parse )
Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java (original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java Thu Nov 13 01:47:55 2008
@@ -16,7 +16,7 @@
import org.apache.droids.impl.SequentialTaskMaster;
import org.apache.droids.impl.SimpleTaskQueue;
import org.apache.droids.parse.html.HtmlParser;
-import org.apache.droids.protocol.http.Http;
+import org.apache.droids.protocol.http.HttpProtocol;
import org.apache.droids.robot.crawler.CrawlingDroid;
public class DroidsFactory
@@ -37,9 +37,8 @@
public static ProtocolFactory createDefaultProtocolFactory() {
ProtocolFactory protocolFactory = new ProtocolFactory();
- Http httpProtocol = new Http();
+ HttpProtocol httpProtocol = new HttpProtocol();
httpProtocol.setForceAllow(true);
- httpProtocol.setUserAgent("Droids/1.1");
protocolFactory.setMap(new HashMap<String, Object>());
protocolFactory.getMap().put("http", httpProtocol);
Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java (original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java Thu Nov 13 01:47:55 2008
@@ -16,7 +16,7 @@
*/
package org.apache.droids.examples;
-import java.io.InputStream;
+import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Handler;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
@@ -34,20 +34,8 @@
@Override
- protected void handle( Parse parse, InputStream openStream, Link link )
+ protected void handle( Parse parse, ContentEntity entity, Link link )
{
// TODO -- something different...
-// Handler handler = getHandlerFactory().resolve("solr");
-// try {
-// handler.handle(getProtocol().openStream(getUri()), new URL(getUri()), parse);
-// } catch (Exception e) {
-// SimpleThreads.threadMessage(e.getMessage());
-// }
-// try {
-// getDroid().finishedWorker(super.getId());
-// } catch (DroidsException e) {
-// // TODO Auto-generated catch block
-// e.printStackTrace();
-// }
}
}
Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java (original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java Thu Nov 13 01:47:55 2008
@@ -34,7 +34,7 @@
import org.apache.droids.impl.SimpleTaskQueue;
import org.apache.droids.net.RegexURLFilter;
import org.apache.droids.parse.html.HtmlParser;
-import org.apache.droids.protocol.http.Http;
+import org.apache.droids.protocol.http.HttpProtocol;
/**
* Simple Droids runtime that wires various components together in Java code
@@ -67,8 +67,7 @@
// Create protocol factory. Support HTTP only.
ProtocolFactory protocolFactory = new ProtocolFactory();
- Http httpProtocol = new Http();
- httpProtocol.setForceAllow(true);
+ HttpProtocol httpProtocol = new HttpProtocol();
protocolFactory.setMap(new HashMap<String, Object>());
protocolFactory.getMap().put("http", httpProtocol);
Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java (original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java Thu Nov 13 01:47:55 2008
@@ -59,8 +59,10 @@
InputStreamEntity entity = new InputStreamEntity(instream, -1);
if (requestURI.endsWith("_html")) {
entity.setContentType("text/html");
+ entity.setChunked(true);
}
response.setEntity(entity);
+
} else {
response.setStatusCode(HttpStatus.SC_NOT_FOUND);
StringEntity entity = new StringEntity(requestURI + " not found",
"US-ASCII");