Hi, we are looking for a simple but fast Java framework to crawl all pages of our website. Our ultimate goal is to index certain parts of each page in a Solr search system.
The crawler must obviously read the HTML. Is it possible to get the page content from Droids without reading the HTTP stream again? I am not sure whether the test code below is OK, but it seemed to return the same URLs twice. That's why I tried to fix that in the "MyCrawlingDroid" class, but I assume that's the wrong place anyway. Links that differ only in their HTTP GET URL parameters should be treated as different links; I saw that Droids may have a problem with that: https://issues.apache.org/jira/browse/DROIDS-144 The reason why we do not yet "simply" use Nutch is that we already have Java code that indexes other data sources into Solr. So it would be nice to integrate a crawler framework into this code and reuse our existing processing/indexing logic. I have not found good examples that fit into our (Guice-based) system. Can you recommend something, or is there not much point in trying to use Droids for such a system (yet)? Thanks, Alexander import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Queue; import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Link; import org.apache.droids.api.TaskMaster; import org.apache.droids.api.Worker; import org.apache.droids.delay.SimpleDelayTimer; import org.apache.droids.exception.DroidsException; import org.apache.droids.handle.SysoutHandler; import org.apache.droids.helper.factories.DroidFactory; import org.apache.droids.helper.factories.HandlerFactory; import org.apache.droids.helper.factories.ParserFactory; import org.apache.droids.helper.factories.ProtocolFactory; import org.apache.droids.helper.factories.URLFiltersFactory; import org.apache.droids.impl.DefaultTaskExceptionHandler; import org.apache.droids.impl.SequentialTaskMaster; import org.apache.droids.net.RegexURLFilter; import
org.apache.droids.parse.html.HtmlParser;
import org.apache.droids.protocol.http.DroidsHttpClient;
import org.apache.droids.protocol.http.HttpProtocol;
import org.apache.droids.robot.crawler.CrawlingDroid;
import org.apache.droids.robot.crawler.CrawlingWorker;
import org.apache.http.HttpVersion;
import org.apache.http.conn.params.ConnManagerParamBean;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParamBean;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HTTP;

/**
 * Minimal Apache Droids crawler that walks a site and prints each fetched
 * page URI at most once. Intended as a starting point for feeding page
 * content into a Solr indexing pipeline.
 *
 * <p>All Droids factories (parser, protocol, URL filter, handler) are wired
 * up by hand here instead of via Spring configuration.</p>
 */
public class VishayIndexerCrawler {

    /**
     * Builds the crawler, runs the crawl to completion, and shuts down the
     * HTTP connection pool.
     *
     * @param args optional; {@code args[0]} overrides the default start URL
     * @throws Exception if crawler initialization, the crawl itself, or
     *                   shutdown fails
     */
    public static void main(String[] args) throws Exception {
        // Use the command-line argument when given, otherwise fall back to
        // the default site (replaces the commented-out args check).
        String targetURL = args.length > 0 ? args[0] : "http://www.vishay.com";

        // Parser factory: extract outlinks from basic HTML markup only.
        // Only <a href> and <link href> are followed; images and scripts
        // are deliberately not registered.
        ParserFactory parserFactory = new ParserFactory();
        HtmlParser htmlParser = new HtmlParser();
        htmlParser.setElements(new HashMap<String, String>());
        htmlParser.getElements().put("a", "href");
        htmlParser.getElements().put("link", "href");
        parserFactory.getMap().put("text/html", htmlParser);

        // Protocol factory: support HTTP/S only.
        ProtocolFactory protocolFactory = new ProtocolFactory();

        // Create and configure the underlying HTTP client.
        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean hppb = new HttpProtocolParamBean(params);
        HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params);
        ConnManagerParamBean cmpb = new ConnManagerParamBean(params);
        // Set protocol parameters.
        hppb.setVersion(HttpVersion.HTTP_1_1);
        hppb.setContentCharset(HTTP.ISO_8859_1);
        hppb.setUseExpectContinue(true);
        // Set connection parameters.
        hcpb.setStaleCheckingEnabled(false);
        // Set connection manager parameters: stay polite, at most two
        // simultaneous connections per route.
        ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
        connPerRouteBean.setDefaultMaxPerRoute(2);
        cmpb.setConnectionsPerRoute(connPerRouteBean);
        DroidsHttpClient httpclient = new DroidsHttpClient(params);
        HttpProtocol httpProtocol = new HttpProtocol(httpclient);
        protocolFactory.getMap().put("http", httpProtocol);
        protocolFactory.getMap().put("https", httpProtocol);

        // URL filter factory: regex include/exclude rules loaded from the
        // classpath decide which discovered links get queued.
        URLFiltersFactory filtersFactory = new URLFiltersFactory();
        RegexURLFilter defaultURLFilter = new RegexURLFilter();
        defaultURLFilter.setFile("classpath:/regex-urlfilter.txt");
        filtersFactory.getMap().put("default", defaultURLFilter);

        // Handler factory: a plain sysout handler. NOTE(review): this
        // factory is never attached to the droid below — the workers get
        // their own anonymous HandlerFactory from MyCrawlingDroid.
        HandlerFactory handlerFactory = new HandlerFactory();
        SysoutHandler defaultHandler = new SysoutHandler();
        handlerFactory.getMap().put("default", defaultHandler);

        // Droid factory: created but left empty for now (placeholder for
        // later wiring).
        DroidFactory<Link> droidFactory = new DroidFactory<Link>();

        // Sequential task master with a small politeness delay between
        // requests.
        SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer();
        simpleDelayTimer.setDelayMillis(100);
        Queue<Link> simpleQueue = new LinkedList<Link>();
        SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
        taskMaster.setDelayTimer(simpleDelayTimer);
        taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());

        CrawlingDroid helloCrawler = new MyCrawlingDroid(simpleQueue, taskMaster);
        helloCrawler.setFiltersFactory(filtersFactory);
        helloCrawler.setParserFactory(parserFactory);
        helloCrawler.setProtocolFactory(protocolFactory);

        Collection<String> initialLocations = new ArrayList<String>();
        initialLocations.add(targetURL);
        helloCrawler.setInitialLocations(initialLocations);

        // Initialize and start the crawl, then block until it finishes
        // (a timeout of 0 waits indefinitely).
        helloCrawler.init();
        helloCrawler.start();
        helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS);

        // Release the HTTP connection pool.
        httpclient.getConnectionManager().shutdown();
    }

    /**
     * CrawlingDroid whose workers report each handled URI at most once.
     *
     * <p>NOTE(review): de-duplicating in the handler only suppresses the
     * duplicate <em>output</em>; the duplicate fetches still happen.
     * Revisits should ideally be prevented before the link is queued
     * (URL filtering / queue de-duplication) — see DROIDS-144.</p>
     */
    static class MyCrawlingDroid extends CrawlingDroid {

        // URIs already reported. Shared by all workers, so it must be
        // thread-safe; the synchronized wrapper makes individual calls
        // atomic.
        static final Set<URI> visited =
                Collections.synchronizedSet(new HashSet<URI>());

        public MyCrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster) {
            super(queue, taskMaster);
        }

        @Override
        public Worker<Link> getNewWorker() {
            final CrawlingWorker worker = new CrawlingWorker(this);
            HandlerFactory hf = new HandlerFactory() {
                public boolean handle(URI uri, ContentEntity entity)
                        throws DroidsException {
                    // Set.add returns true only for a URI not seen before.
                    // This single atomic call replaces the original
                    // contains()-then-add() pair, which raced between
                    // workers and could print the same URI twice.
                    if (visited.add(uri)) {
                        System.out.println(uri);
                    }
                    return true;
                }
            };
            worker.setHandlerFactory(hf);
            return worker;
        }
    }
}
