Author: rfrovarp
Date: Tue May 24 19:51:35 2011
New Revision: 1127244
URL: http://svn.apache.org/viewvc?rev=1127244&view=rev
Log:
Patch courtesy Eugen Paraschiv.
This fixes DROIDS-143 so that max crawl depth is honored.
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/TaskValidator.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/URLFilter.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/MaxDepthTaskValidator.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingDroid.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/validator/ChainTaskValidator.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/handler/ExceptionReportHandler.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleQueue.java
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/TaskValidator.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/TaskValidator.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/TaskValidator.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/TaskValidator.java
Tue May 24 19:51:35 2011
@@ -16,19 +16,13 @@
*/
package org.apache.droids.api;
-import org.apache.droids.exception.InvalidTaskException;
-
/**
*
* @since 1.0
*/
public interface TaskValidator<T extends Task> {
/**
- * This will take a task and make sure it is valid. It <b>may</b>
- * modify the task so it is valid. For example, a URL may be normalized
- * within the validateTask method.
- *
- * @throws InvalidTaskException
+ * This will take a task and make sure it is valid.
*/
- T validateTask( T task ) throws InvalidTaskException;
+ boolean validate( T task );
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/URLFilter.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/URLFilter.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/URLFilter.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/URLFilter.java
Tue May 24 19:51:35 2011
@@ -33,7 +33,7 @@ public interface URLFilter {
*
* @param urlString
* the url to filter
- * @return null if the filter excluses the url or the url again if allowed
+ * @return null if the filter excludes the url or the url again if allowed
*/
String filter(String urlString);
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/MaxDepthTaskValidator.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/MaxDepthTaskValidator.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/MaxDepthTaskValidator.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/MaxDepthTaskValidator.java
Tue May 24 19:51:35 2011
@@ -17,7 +17,6 @@
package org.apache.droids.impl;
import org.apache.droids.api.*;
-import org.apache.droids.exception.InvalidTaskException;
/**
* A simple
@@ -26,20 +25,19 @@ public class MaxDepthTaskValidator<T ext
private int maxDepth = -1;
public MaxDepthTaskValidator() {
-
+ super();
}
- public MaxDepthTaskValidator( int maxDepth ) {
+ public MaxDepthTaskValidator(int maxDepth) {
this.maxDepth = maxDepth;
}
@Override
- public T validateTask(T task) throws InvalidTaskException {
- if( maxDepth > 0 && task.getDepth() > maxDepth ) {
- throw new InvalidTaskException(
- "task exceeds maximum depth: ["+task.getDepth() +" > "+ maxDepth+"]");
+ public boolean validate(final T task) {
+ if (maxDepth > 0 && task.getDepth() > maxDepth) {
+ return false;
}
- return task;
+ return true;
}
public int getMaxDepth() {
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingDroid.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingDroid.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingDroid.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingDroid.java
Tue May 24 19:51:35 2011
@@ -27,6 +27,7 @@ import org.apache.droids.AbstractDroid;
import org.apache.droids.LinkTask;
import org.apache.droids.api.Link;
import org.apache.droids.api.TaskMaster;
+import org.apache.droids.api.TaskValidator;
import org.apache.droids.api.Worker;
import org.apache.droids.exception.InvalidTaskException;
import org.apache.droids.helper.factories.ParserFactory;
@@ -40,6 +41,7 @@ public abstract class CrawlingDroid exte
ProtocolFactory protocolFactory;
ParserFactory parserFactory;
URLFiltersFactory filtersFactory;
+ private TaskValidator<Link> linkValidator;
public CrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster)
{
@@ -109,4 +111,15 @@ public abstract class CrawlingDroid exte
{
this.filtersFactory = filtersFactory;
}
+
+ public void setLinkValidator(TaskValidator<Link> linkValidator)
+ {
+ this.linkValidator = linkValidator;
+ }
+
+ public TaskValidator<Link> getLinkValidator()
+ {
+ return linkValidator;
+ }
+
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
Tue May 24 19:51:35 2011
@@ -28,6 +28,7 @@ import org.apache.droids.api.ManagedCont
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
import org.apache.droids.api.Protocol;
+import org.apache.droids.api.TaskValidator;
import org.apache.droids.api.Worker;
import org.apache.droids.exception.DroidsException;
import org.apache.droids.helper.factories.HandlerFactory;
@@ -115,13 +116,19 @@ public class CrawlingWorker implements W
protected Collection<Link> getFilteredOutlinks( Parse parse )
{
URLFiltersFactory filters = droid.getFiltersFactory();
+ TaskValidator< Link > linkValidator = droid.getLinkValidator();
// TODO -- make the hashvalue for Outlink...
Map<String,Link> filtered = new LinkedHashMap<String,Link>();
for( Link outlink : parse.getOutlinks() ) {
String id = outlink.getId();
- if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
- filtered.put(id,outlink);
+ if (filters.accept(id) && !filtered.containsKey(id)) {
+ if( linkValidator == null ){
+ filtered.put(id,outlink);
+ }
+ else if( linkValidator.validate( outlink ) ){
+ filtered.put(id,outlink);
+ }
}
}
return filtered.values();
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/validator/ChainTaskValidator.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/validator/ChainTaskValidator.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/validator/ChainTaskValidator.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/validator/ChainTaskValidator.java
Tue May 24 19:51:35 2011
@@ -21,7 +21,6 @@ import java.util.Set;
import org.apache.droids.api.Task;
import org.apache.droids.api.TaskValidator;
-import org.apache.droids.exception.InvalidTaskException;
/**
* A chain task validator executes a chain of unique validators
@@ -37,16 +36,13 @@ public final class ChainTaskValidator<T
}
@Override
- public final T validateTask(final T task) throws InvalidTaskException {
- T currentResult = task;
+ public final boolean validate(final T task) {
for (final TaskValidator<T> taskValidator : this.validatorChain) {
- currentResult = taskValidator.validateTask(currentResult);
- if (currentResult == null) {
- break;
+ if (!taskValidator.validate(task)) {
+ return false;
}
}
-
- return currentResult;
+ return true;
}
public final void addTaskValidator(final TaskValidator<T> taskValidator) {
Modified:
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/handler/ExceptionReportHandler.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/handler/ExceptionReportHandler.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/handler/ExceptionReportHandler.java
(original)
+++
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/handler/ExceptionReportHandler.java
Tue May 24 19:51:35 2011
@@ -1,21 +1,21 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.droids.examples.handler;
import java.io.IOException;
@@ -26,13 +26,13 @@ import org.apache.droids.exception.Droid
public class ExceptionReportHandler extends ReportHandler {
+ public ExceptionReportHandler() {
+ super();
+ }
+
@Override
- public void handle(URI uri, ContentEntity entity) throws IOException,
- DroidsException {
+ public void handle(URI uri, ContentEntity entity) throws IOException, DroidsException {
super.handle(uri, entity);
- if (uri.getPath().equals("/page3_html")) {
- throw new RuntimeException("Oppsie!!!");
- }
}
-
+
}
Modified:
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleDroid.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
(original)
+++
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
Tue May 24 19:51:35 2011
@@ -34,38 +34,36 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-public class TestSimpleDroid
-{
+public class TestSimpleDroid {
protected LocalHttpServer testserver;
-
+
@Before
public void initializeLocalTestServer() {
this.testserver = new LocalHttpServer();
}
-
+
@After
public void shutdownLocalTestServer() throws IOException {
this.testserver.stop();
}
@Test
- public void testBasicCrawling() throws Exception
- {
+ public void testBasicCrawling() throws Exception {
this.testserver.register("*", new ResourceHandler());
this.testserver.start();
-
- String baseURI = "http:/" + this.testserver.getServiceAddress();
- String targetURI = baseURI + "/start_html";
-
-
+
+ String baseURI = "http:/" + this.testserver.getServiceAddress();
+ String targetURI = baseURI + "/start_html";
+
Droid<Link> droid = DroidsFactory.createSimpleReportCrawlingDroid(targetURI);
-
+
droid.init();
droid.start();
-
- while (!droid.getTaskMaster().awaitTermination(250L, TimeUnit.MILLISECONDS));
-
+
+ while (!droid.getTaskMaster().awaitTermination(250L, TimeUnit.MILLISECONDS))
+ ;
+
Assert.assertFalse(ReportHandler.getReport().isEmpty());
Assert.assertEquals(5, ReportHandler.getReport().size());
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/start_html"));
@@ -73,22 +71,20 @@ public class TestSimpleDroid
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page2_html"));
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page3_html"));
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page4_html"));
-
+
ReportHandler.recycle();
}
-
+
@Test
- public void testTerminateCrawlingOnException() throws Exception
- {
+ public void testTerminateCrawlingOnException() throws Exception {
this.testserver.register("*", new ResourceHandler());
this.testserver.start();
-
- String baseURI = "http:/" + this.testserver.getServiceAddress();
- String targetURI = baseURI + "/start_html";
-
- Droid<Link> droid = DroidsFactory.createSimpleExceptionCrawlingDroid(
- targetURI);
-
+
+ String baseURI = "http:/" + this.testserver.getServiceAddress();
+ String targetURI = baseURI + "/start_html";
+
+ Droid<Link> droid = DroidsFactory.createSimpleExceptionCrawlingDroid(targetURI);
+
TaskMaster<Link> taskMaster = (TaskMaster<Link>) droid.getTaskMaster();
taskMaster.setExceptionHandler(new TaskExceptionHandler() {
@@ -100,18 +96,20 @@ public class TestSimpleDroid
}
});
-
+
droid.init();
droid.start();
- while (!droid.getTaskMaster().awaitTermination(250L, TimeUnit.MILLISECONDS));
-
+ while (!droid.getTaskMaster().awaitTermination(250L, TimeUnit.MILLISECONDS))
+ ;
+
Assert.assertFalse(ReportHandler.getReport().isEmpty());
- Assert.assertEquals(4, ReportHandler.getReport().size());
+ Assert.assertEquals(5, ReportHandler.getReport().size());
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/start_html"));
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page1_html"));
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page2_html"));
Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page3_html"));
-
+ Assert.assertTrue(ReportHandler.getReport().contains(baseURI + "/page4_html"));
+
ReportHandler.recycle();
}
Modified:
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleQueue.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleQueue.java?rev=1127244&r1=1127243&r2=1127244&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleQueue.java
(original)
+++
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/impl/TestSimpleQueue.java
Tue May 24 19:51:35 2011
@@ -20,50 +20,45 @@ import java.io.File;
import junit.framework.Assert;
-import org.apache.droids.exception.InvalidTaskException;
import org.apache.droids.robot.walker.FileTask;
+import org.junit.Before;
import org.junit.Test;
-public class TestSimpleQueue
-{
-
- /* @Test
- public void testMaxSize() throws Exception
- {
- SimpleTaskQueue<LinkTask> taskQueue = new SimpleTaskQueue<LinkTask>();
- taskQueue.setMaxSize( 10 );
-
- // we should be able to put in 10 tasks...
- for( int i=0; i<taskQueue.getMaxSize(); i++ ) {
- taskQueue.merge( new LinkTask( null, new URI("http://www/"+i), 0 ) );
- }
- Assert.assertEquals( 10, taskQueue.getSize() );
-
- try {
- taskQueue.merge( new LinkTask( null, new URI("http://xxxx/"), 0 ) );
- Assert.fail( "adding a task should have failed -- it is too big" );
- }
- catch( InvalidTaskException ex ) { }
- }*/
+public class TestSimpleQueue {
+ MaxDepthTaskValidator<FileTask> validator;
+
+ @Before
+ public final void initialize(){
+ validator = new MaxDepthTaskValidator<FileTask>();
+ validator.setMaxDepth(5);
+ }
+
@Test
- public void testMaxDepth() throws Exception
- {
- MaxDepthTaskValidator<FileTask> validator = new MaxDepthTaskValidator<FileTask>();
- validator.setMaxDepth( 5 );
-
- // Testing directly...
- FileTask task = new FileTask( new File( "" ), 3 );
- validator.validateTask( task ); // don't throw exception
+ public void whenTaskBelowMaxDepthIsValidated_thenTaskIsValid() throws Exception {
+ final FileTask task = new FileTask(new File(""), 3);
- task = new FileTask( new File( "" ), 5 );
- validator.validateTask( task ); // don't throw exception (can be equal)
-
- task = new FileTask( new File( "" ), 7 );
- try {
- validator.validateTask( task );
- Assert.fail( "should faile because it was too deep" );
- }
- catch( InvalidTaskException ex ) {}
+ boolean isValid = validator.validate(task);
+
+ Assert.assertTrue(isValid);
+ }
+
+ @Test
+ public void whenTaskEqualToMaxDepthIsValidated_thenTaskIsValid() throws Exception {
+ final FileTask task = new FileTask(new File(""), 5);
+
+ boolean isValid = validator.validate(task);
+
+ Assert.assertTrue(isValid);
+ }
+
+ @Test
+ public void whenTaskOverMaxDepthIsValidated_thenTaskIsNotValid() throws Exception {
+ final FileTask task = new FileTask(new File(""), 7);
+
+ boolean isValid = validator.validate(task);
+
+ Assert.assertFalse(isValid);
}
+
}