Hi, all.
 
I have done some quick work integrating the WebSPHINX web crawler with Lucene.  I found it fairly trivial, and have enclosed some example source code in this mail.  You will need the websphinx jar file from the WebSPHINX project page.
 
Obviously, this example could be improved quite a bit, but it is a good starting point for someone wanting to develop a spider.
 
Hope this helps,
 
Scott
 
********************************************
import websphinx.*;
import com.lucene.index.*;
import com.lucene.analysis.*;
import java.io.*;
 
public class Index {
    public static void main(String[] args) {
        try {
            // Create a new index in the "index" directory.
            IndexWriter writer = new IndexWriter("index", new StopAnalyzer(), true);
            // A higher merge factor speeds up bulk indexing at the cost
            // of keeping more files open during the crawl.
            writer.mergeFactor = 20;
            // Crawl the site, indexing each page as it is visited.
            IndexingCrawler c = new IndexingCrawler(writer, "http://www.yahoo.com");
            c.run();
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException,
            // so a single catch block covers both.
            e.printStackTrace(System.out);
        }
    }
}
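 
(To try this out, put websphinx.jar and the Lucene jar on the classpath, compile both classes, and run the Index class.  The jar names will vary with the versions you download, but something along these lines should work:)
 
    javac -classpath websphinx.jar:lucene.jar Index.java IndexingCrawler.java
    java -classpath .:websphinx.jar:lucene.jar Index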
 
**********************************************
The actual crawler is below
**********************************************
import websphinx.*;
import com.lucene.document.*;
import com.lucene.index.*;
import java.io.*;
import java.net.*;
 
public class IndexingCrawler extends Crawler {
 
    private IndexWriter writer;
 
    public IndexingCrawler(IndexWriter writer, String docroot) {
        super();
        try {
            this.setRoot(new Link(docroot));
        } catch (MalformedURLException e) {
            this.setRoot(null);
        }
        this.writer = writer;
        this.setSynchronous(true);      // process pages in crawl order
        this.setDomain(Crawler.SERVER); // stay on the starting server
    }
 
    public void visit(Page p) {
        // Every visited page is indexed; see noindex() below for a skip hook.
        System.out.println("Visiting [" + p.getURL() + "]");
        index(p);
        System.out.println("    Done.");
    }
 
    public void index(Page p) {
        StringBuffer contents = new StringBuffer();
        Document doc = new Document();
        doc.add(Field.Text("path", p.getURL().toString()));
        doc.add(Field.Keyword("modified",
                DateField.timeToString(p.getLastModified())));
 
        if (p.getTitle() != null) {
            doc.add(Field.Text("title", p.getTitle()));
        }
 
        System.out.println("    Indexing...");
        System.out.println("        depth [" + p.getDepth() + "]");
        System.out.println("        title [" + p.getTitle() + "]");
        System.out.println("        modified [" + p.getLastModified() + "]");
        // Index each named <meta> tag as a field of its own.
        Element[] elements = p.getElements();
        for (int i = 0; i < elements.length; i++) {
            if (elements[i].getTagName().equalsIgnoreCase("meta")) {
                String name = elements[i].getHTMLAttribute("name", "");
                String content = elements[i].getHTMLAttribute("content", "");
                if (!name.equals("")) {
                    doc.add(Field.Text(name, content));
                    System.out.println("        meta [" + name + ":" + content + "]");
                }
            }
        }
        // Concatenate the words on the page into a single searchable field.
        Text[] texts = p.getWords();
        for (int i = 0; i < texts.length; i++) {
            contents.append(texts[i].toText());
            contents.append(" ");
        }
        doc.add(Field.Text("contents", contents.toString()));
        try {
            writer.addDocument(doc);
        } catch (IOException e) {
            // visit() is not declared to throw checked exceptions,
            // so wrap the IOException.
            throw new RuntimeException(e.toString());
        }
    }
 
    public void noindex(Page p) {
        // Hook for pages that should be skipped; visit() does not call
        // this yet, but a smarter crawler could.
        System.out.println("    Skipping...");
    }
}
*****************************
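 
If you want to sanity-check the index afterwards, a minimal searcher might look like the code below.  This is only a sketch: it assumes the same com.lucene-era API as the code above (including a com.lucene.queryParser package), reuses the "contents", "path", and "title" field names from IndexingCrawler, and takes the query as the first command-line argument.
 
**********************************************
import com.lucene.analysis.*;
import com.lucene.document.*;
import com.lucene.queryParser.*;
import com.lucene.search.*;
 
public class Search {
    public static void main(String[] args) throws Exception {
        // Open the index the crawler wrote to the "index" directory.
        Searcher searcher = new IndexSearcher("index");
        // Parse the query against the "contents" field with the same
        // analyzer the index was built with.
        Query query = QueryParser.parse(args[0], "contents", new StopAnalyzer());
        Hits hits = searcher.search(query);
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.println(doc.get("path") + " [" + doc.get("title") + "]");
        }
        searcher.close();
    }
}
**********************************************
Running "java Search websphinx", for example, prints the path and title of each matching page.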
