Hi, we are looking for a simple but fast Java framework to crawl all pages of our website. Our ultimate goal is to index certain parts of each page in a Solr search system.
The crawler must obviously read the HTML. Is it possible to get the page content from Droids without reading the HTTP stream again? I am not sure whether the test code below is OK, but it seemed to return the same URLs twice. That's why I tried to fix that in the "MyCrawlingDroid" class, but I assume that's the wrong place anyway. Links that differ only in their HTTP GET URL parameters should be treated as different links; I saw that Droids may have a problem with that: https://issues.apache.org/jira/browse/DROIDS-144 The reason why we do not yet "simply" use Nutch is that we already have Java code that indexes other data sources into Solr. So it would be nice to integrate a crawler framework into this code and reuse our existing processing/indexing logic. I have not found good examples that fit into our (Guice-based) system. Can you recommend something, or is there not much point in trying to use Droids for such a system (yet)? Thanks, Alexander import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Queue; import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Link; import org.apache.droids.api.TaskMaster; import org.apache.droids.api.Worker; import org.apache.droids.delay.SimpleDelayTimer; import org.apache.droids.exception.DroidsException; import org.apache.droids.handle.SysoutHandler; import org.apache.droids.helper.factories.DroidFactory; import org.apache.droids.helper.factories.HandlerFactory; import org.apache.droids.helper.factories.ParserFactory; import org.apache.droids.helper.factories.ProtocolFactory; import org.apache.droids.helper.factories.URLFiltersFactory; import org.apache.droids.impl.DefaultTaskExceptionHandler; import org.apache.droids.impl.SequentialTaskMaster; import org.apache.droids.net.RegexURLFilter; import
org.apache.droids.parse.html.HtmlParser;
import org.apache.droids.protocol.http.DroidsHttpClient;
import org.apache.droids.protocol.http.HttpProtocol;
import org.apache.droids.robot.crawler.CrawlingDroid;
import org.apache.droids.robot.crawler.CrawlingWorker;
import org.apache.http.HttpVersion;
import org.apache.http.conn.params.ConnManagerParamBean;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParamBean;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HTTP;

/**
 * Minimal Apache Droids crawler that walks a site and prints each fetched
 * page URI at most once. Intended as a starting point for feeding page
 * content into a Solr indexing pipeline.
 *
 * <p>All Droids factories (parser, protocol, URL filter, handler) are wired
 * up by hand here instead of via Spring configuration.</p>
 */
public class VishayIndexerCrawler {

    /**
     * Builds the crawler, runs the crawl to completion, and shuts down the
     * HTTP connection pool.
     *
     * @param args optional; {@code args[0]} overrides the default start URL
     * @throws Exception if crawler initialization, the crawl itself, or
     *                   shutdown fails
     */
    public static void main(String[] args) throws Exception {
        // Use the command-line argument when given, otherwise fall back to
        // the default site (replaces the commented-out args check).
        String targetURL = args.length > 0 ? args[0] : "http://www.vishay.com";

        // Parser factory: extract outlinks from basic HTML markup only.
        // Only <a href> and <link href> are followed; images and scripts
        // are deliberately not registered.
        ParserFactory parserFactory = new ParserFactory();
        HtmlParser htmlParser = new HtmlParser();
        htmlParser.setElements(new HashMap<String, String>());
        htmlParser.getElements().put("a", "href");
        htmlParser.getElements().put("link", "href");
        parserFactory.getMap().put("text/html", htmlParser);

        // Protocol factory: support HTTP/S only.
        ProtocolFactory protocolFactory = new ProtocolFactory();

        // Create and configure the underlying HTTP client.
        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean hppb = new HttpProtocolParamBean(params);
        HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params);
        ConnManagerParamBean cmpb = new ConnManagerParamBean(params);
        // Set protocol parameters.
        hppb.setVersion(HttpVersion.HTTP_1_1);
        hppb.setContentCharset(HTTP.ISO_8859_1);
        hppb.setUseExpectContinue(true);
        // Set connection parameters.
        hcpb.setStaleCheckingEnabled(false);
        // Set connection manager parameters: stay polite, at most two
        // simultaneous connections per route.
        ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
        connPerRouteBean.setDefaultMaxPerRoute(2);
        cmpb.setConnectionsPerRoute(connPerRouteBean);
        DroidsHttpClient httpclient = new DroidsHttpClient(params);
        HttpProtocol httpProtocol = new HttpProtocol(httpclient);
        protocolFactory.getMap().put("http", httpProtocol);
        protocolFactory.getMap().put("https", httpProtocol);

        // URL filter factory: regex include/exclude rules loaded from the
        // classpath decide which discovered links get queued.
        URLFiltersFactory filtersFactory = new URLFiltersFactory();
        RegexURLFilter defaultURLFilter = new RegexURLFilter();
        defaultURLFilter.setFile("classpath:/regex-urlfilter.txt");
        filtersFactory.getMap().put("default", defaultURLFilter);

        // Handler factory: a plain sysout handler. NOTE(review): this
        // factory is never attached to the droid below — the workers get
        // their own anonymous HandlerFactory from MyCrawlingDroid.
        HandlerFactory handlerFactory = new HandlerFactory();
        SysoutHandler defaultHandler = new SysoutHandler();
        handlerFactory.getMap().put("default", defaultHandler);

        // Droid factory: created but left empty for now (placeholder for
        // later wiring).
        DroidFactory<Link> droidFactory = new DroidFactory<Link>();

        // Sequential task master with a small politeness delay between
        // requests.
        SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer();
        simpleDelayTimer.setDelayMillis(100);
        Queue<Link> simpleQueue = new LinkedList<Link>();
        SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
        taskMaster.setDelayTimer(simpleDelayTimer);
        taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());

        CrawlingDroid helloCrawler = new MyCrawlingDroid(simpleQueue, taskMaster);
        helloCrawler.setFiltersFactory(filtersFactory);
        helloCrawler.setParserFactory(parserFactory);
        helloCrawler.setProtocolFactory(protocolFactory);

        Collection<String> initialLocations = new ArrayList<String>();
        initialLocations.add(targetURL);
        helloCrawler.setInitialLocations(initialLocations);

        // Initialize and start the crawl, then block until it finishes
        // (a timeout of 0 waits indefinitely).
        helloCrawler.init();
        helloCrawler.start();
        helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS);

        // Release the HTTP connection pool.
        httpclient.getConnectionManager().shutdown();
    }

    /**
     * CrawlingDroid whose workers report each handled URI at most once.
     *
     * <p>NOTE(review): de-duplicating in the handler only suppresses the
     * duplicate <em>output</em>; the duplicate fetches still happen.
     * Revisits should ideally be prevented before the link is queued
     * (URL filtering / queue de-duplication) — see DROIDS-144.</p>
     */
    static class MyCrawlingDroid extends CrawlingDroid {

        // URIs already reported. Shared by all workers, so it must be
        // thread-safe; the synchronized wrapper makes individual calls
        // atomic.
        static final Set<URI> visited =
                Collections.synchronizedSet(new HashSet<URI>());

        public MyCrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster) {
            super(queue, taskMaster);
        }

        @Override
        public Worker<Link> getNewWorker() {
            final CrawlingWorker worker = new CrawlingWorker(this);
            HandlerFactory hf = new HandlerFactory() {
                public boolean handle(URI uri, ContentEntity entity)
                        throws DroidsException {
                    // Set.add returns true only for a URI not seen before.
                    // This single atomic call replaces the original
                    // contains()-then-add() pair, which raced between
                    // workers and could print the same URI twice.
                    if (visited.add(uri)) {
                        System.out.println(uri);
                    }
                    return true;
                }
            };
            worker.setHandlerFactory(hf);
            return worker;
        }
    }
}
