On Wed, Jul 07, 2004 at 12:57:19AM -0700, [EMAIL PROTECTED] wrote: > On Tue, Jul 06, 2004 at 12:48:13PM -0700, Doug Cutting wrote: > > [EMAIL PROTECTED] wrote: > > >Currently, the definition of one external parser requires > > >synchronized changes to both files ./nutch/build/plugins/parse-ext/command > > >and ./nutch/build/plugins/parse-ext/plugin.xml. Ideally, > > >it would be handled solely by the file plugin.xml. However, that would > > >require attributes in plugin.xml to be passed to ExtParser.java, and > > >I could not figure out a proper way to do it with the current plugin system. > > >Stephan and Doug: any suggestions? > > > > Yes, I agree it would be better if a plugin implementation could access > > the attributes easily from plugin.xml. Perhaps we should make > > ParserFactory.getExtension() public; then you can call getAttribute() on > > the returned Extension. Would that work?
It does not do what I want. Anyway, I have added a JUnit test to the previous patch. The result is a new patch (20040709, attached below, also available at http://nutch.neasys.com/patch/). If no one objects, I will commit it to CVS in two days. John ------------------------- patch.txt.20040709 ----------------------- diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/conf/nutch-default.xml nutch-cvs-20040709.xing/conf/nutch-default.xml --- nutch-cvs-20040703/conf/nutch-default.xml 2004-06-30 14:54:07.000000000 -0700 +++ nutch-cvs-20040709.xing/conf/nutch-default.xml 2004-07-09 16:31:46.000000000 -0700 @@ -382,4 +382,19 @@ <description>A Directory where nutch plugin are located</description> </property> +<property> + <name>plugin.parse.ext.command</name> + <value>./build/plugins/parse-ext/command</value> + <description>Name or path of an external command that will be invoked + by parse-ext plugin.</description> +</property> + +<property> + <name>plugin.parse.ext.timeout</name> + <value>30</value> + <description>Duration of external command process, in seconds. + External command process will be terminated if timed out. 
+ </description> +</property> + </nutch-conf> diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/java/net/nutch/parse/ParserChecker.java nutch-cvs-20040709.xing/src/java/net/nutch/parse/ParserChecker.java --- nutch-cvs-20040703/src/java/net/nutch/parse/ParserChecker.java 2004-06-15 09:42:29.000000000 -0700 +++ nutch-cvs-20040709.xing/src/java/net/nutch/parse/ParserChecker.java 2004-07-09 16:31:46.000000000 -0700 @@ -53,8 +53,11 @@ Protocol protocol = ProtocolFactory.getProtocol(url); Content content = protocol.getContent(url); - if (!force) + if (force) { + content.setContentType(contentType); + } else { contentType = content.getContentType(); + } if (contentType == null) { System.err.println(""); @@ -62,6 +65,7 @@ } LOG.info("parsing: "+url); + LOG.info("contentType: "+contentType); Parser parser = ParserFactory.getParser(contentType, url); Parse parse = parser.getParse(content); diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/java/net/nutch/util/CommandRunner.java nutch-cvs-20040709.xing/src/java/net/nutch/util/CommandRunner.java --- nutch-cvs-20040703/src/java/net/nutch/util/CommandRunner.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/java/net/nutch/util/CommandRunner.java 2004-07-09 16:31:46.000000000 -0700 @@ -0,0 +1,276 @@ +/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +/* + * Adopted by John Xing for Nutch Project from + * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/, + * which explains the code in detail. + * + * Comments by John Xing on 20040621: + * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now. + * Modifications are needed if we move to j2sdk 1.5. 
+ * (2) The original looks good, not much to change. + * + * This code is in the public domain and comes with no warranty. + */ +package net.nutch.util; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import EDU.oswego.cs.dl.util.concurrent.BrokenBarrierException; +import EDU.oswego.cs.dl.util.concurrent.CyclicBarrier; +import EDU.oswego.cs.dl.util.concurrent.TimeoutException; + +public class CommandRunner { + + private boolean _waitForExit = true; + private String _command; + private int _timeout = 10; + private boolean _destroyOnTimeout = true; + + private InputStream _stdin; + private OutputStream _stdout; + private OutputStream _stderr; + + private static final int BUF = 4096; + + private int _xit; + + private Throwable _thrownError; + + private CyclicBarrier _barrier; + + public int getExitValue() { + return _xit; + } + + public void setCommand(String s) { + _command = s; + } + + public String getCommand() { + return _command; + } + + public void setInputStream(InputStream is) { + _stdin = is; + } + + public void setStdOutputStream(OutputStream os) { + _stdout = os; + } + + public void setStdErrorStream(OutputStream os) { + _stderr = os; + } + + public void evaluate() throws IOException { + Process proc = Runtime.getRuntime().exec(_command); + + _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 
1 : 0)); + + PullerThread so = + new PullerThread("STDOUT", proc.getInputStream(), _stdout); + so.start(); + + PullerThread se = + new PullerThread("STDERR", proc.getErrorStream(), _stderr); + se.start(); + + PusherThread si = null; + if (_stdin != null) { + si = new PusherThread("STDIN", _stdin, proc.getOutputStream()); + si.start(); + } + + boolean _timedout = false; + long end = System.currentTimeMillis() + _timeout * 1000; + + try { + if (_timeout == 0) { + _barrier.barrier(); + } else { + _barrier.attemptBarrier(_timeout * 1000); + } + } catch (TimeoutException ex) { + _timedout = true; + if (si != null) { + si.interrupt(); + } + so.interrupt(); + se.interrupt(); + if (_destroyOnTimeout) { + proc.destroy(); + } + } catch (BrokenBarrierException bbe) { + /* IGNORE */ + } catch (InterruptedException e) { + /* IGNORE */ + } + + _xit = -1; + + if (!_timedout) { + if (_waitForExit) { + do { + try { + _xit = proc.exitValue(); + Thread.sleep(250); + } catch (InterruptedException ie) { + /* IGNORE */ + } catch (IllegalThreadStateException iltse) { + continue; + } + break; + } while (!(_timedout = (System.currentTimeMillis() > end))); + } else { + try { + _xit = proc.exitValue(); + } catch (IllegalThreadStateException iltse) { + _timedout = true; + } + } + } + + if (_timedout) { + if (_destroyOnTimeout) { + proc.destroy(); + } + } + } + + public Throwable getThrownError() { + return _thrownError; + } + + private class PumperThread extends Thread { + + private OutputStream _os; + private InputStream _is; + + private volatile boolean _kaput; + + private boolean _closeInput; + + protected PumperThread( + String name, + InputStream is, + OutputStream os, + boolean closeInput) { + super(name); + _is = is; + _os = os; + _closeInput = closeInput; + } + + public void run() { + _kaput = false; + try { + byte[] buf = new byte[BUF]; + int read = 0; + while (!isInterrupted() && (read = _is.read(buf)) != -1) { + if (read == 0) + continue; + _os.write(buf, 0, read); + _os.flush(); + 
} + } catch (Throwable t) { + _thrownError = t; + return; + } finally { + try { + if (_closeInput) { + _is.close(); + } else { + _os.close(); + } + } catch (IOException ioe) { + /* IGNORE */ + } + } + try { + _barrier.barrier(); + } catch (InterruptedException ie) { + /* IGNORE */ + } catch (BrokenBarrierException bbe) { + /* IGNORE */ + } + } + } + + private class PusherThread extends PumperThread { + PusherThread(String name, InputStream is, OutputStream os) { + super(name, is, os, false); + } + } + + private class PullerThread extends PumperThread { + PullerThread(String name, InputStream is, OutputStream os) { + super(name, is, os, true); + } + } + + public int getTimeout() { + return _timeout; + } + + public void setTimeout(int timeout) { + _timeout = timeout; + } + + public boolean getDestroyOnTimeout() { + return _destroyOnTimeout; + } + + public void setDestroyOnTimeout(boolean destroyOnTimeout) { + _destroyOnTimeout = destroyOnTimeout; + } + + public boolean getWaitForExit() { + return _waitForExit; + } + + public void setWaitForExit(boolean waitForExit) { + _waitForExit = waitForExit; + } + + public static void main(String[] args) throws Exception { + String commandPath = null; + String filePath = null; + int timeout = 10; + + String usage = "Usage: CommandRunner [-timeout timeout] commandPath filePath"; + + if (args.length < 2) { + System.err.println(usage); + System.exit(-1); + } + + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-timeout")) { + timeout = Integer.parseInt(args[++i]);; + } else if (i != args.length-2) { + System.err.println(usage); + System.exit(-1); + } else { + commandPath = args[i]; + filePath = args[++i]; + } + } + + CommandRunner cr = new CommandRunner(); + + cr.setCommand(commandPath); + cr.setInputStream(new java.io.FileInputStream(filePath)); + cr.setStdErrorStream(System.err); + cr.setStdOutputStream(System.out); + + cr.setTimeout(timeout); + + cr.evaluate(); + + System.err.println("output value: 
"+cr.getExitValue()); + } +} diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/build.xml nutch-cvs-20040709.xing/src/plugin/build.xml --- nutch-cvs-20040703/src/plugin/build.xml 2004-06-30 15:15:51.000000000 -0700 +++ nutch-cvs-20040709.xing/src/plugin/build.xml 2004-07-09 16:34:06.000000000 -0700 @@ -13,6 +13,7 @@ <ant dir="parse-text" target="deploy"/> <ant dir="parse-pdf" target="deploy"/> <ant dir="parse-msword" target="deploy"/> + <ant dir="parse-ext" target="deploy"/> <ant dir="index-basic" target="deploy"/> <ant dir="creativecommons" target="deploy"/> <ant dir="languageidentifier" target="deploy"/> @@ -24,6 +25,7 @@ <target name="test"> <ant dir="protocol-http" target="test"/> <ant dir="parse-html" target="test"/> + <ant dir="parse-ext" target="test"/> </target> <!-- ====================================================== --> @@ -37,6 +39,7 @@ <ant dir="parse-text" target="clean"/> <ant dir="parse-pdf" target="clean"/> <ant dir="parse-msword" target="clean"/> + <ant dir="parse-ext" target="clean"/> <ant dir="index-basic" target="clean"/> <ant dir="creativecommons" target="clean"/> <ant dir="languageidentifier" target="clean"/> diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/parse-ext/build.xml nutch-cvs-20040709.xing/src/plugin/parse-ext/build.xml --- nutch-cvs-20040703/src/plugin/parse-ext/build.xml 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/plugin/parse-ext/build.xml 2004-07-09 16:31:46.000000000 -0700 @@ -0,0 +1,10 @@ +<?xml version="1.0"?> + +<project name="parse-ext" default="jar"> + + <import file="../build-plugin.xml"/> + + <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/> + <chmod file="${deploy.dir}/command" perm="755"/> + +</project> diff -Nur --exclude=crawl-urlfilter.txt 
--exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/parse-ext/command nutch-cvs-20040709.xing/src/plugin/parse-ext/command --- nutch-cvs-20040703/src/plugin/parse-ext/command 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/plugin/parse-ext/command 2004-07-09 16:31:46.000000000 -0700 @@ -0,0 +1,24 @@ +#!/bin/bash +# +# Sample bash script as external command invoked by parse-ext plugin +# +# 20040701, John Xing + +set -e + +if [ $# -ne 1 ]; then + echo Usage:$0 mimeType >&2 + exit 1 +fi + +case $1 in +"application/vnd.nutch.example.cat") + cat + ;; +"application/vnd.nutch.example.md5sum") + md5sum + ;; +*) + echo "Can't parse mimeType $1" >&2 + exit 1 +esac diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/parse-ext/plugin.xml nutch-cvs-20040709.xing/src/plugin/parse-ext/plugin.xml --- nutch-cvs-20040703/src/plugin/parse-ext/plugin.xml 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/plugin/parse-ext/plugin.xml 2004-07-09 16:31:46.000000000 -0700 @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-ext" + name="External Parser Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <extension-point + id="net.nutch.parse.Parser" + name="Nutch Content Parser"/> + + <runtime> + <library name="parse-ext.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="net.nutch.parse.ext" + name="ExtParse" + point="net.nutch.parse.Parser"> + + <implementation id="net.nutch.parse.ext.ExtParser" + class="net.nutch.parse.ext.ExtParser" + contentType="application/vnd.nutch.example.cat" + pathSuffix=""/> + + <implementation id="net.nutch.parse.ext.ExtParser" + class="net.nutch.parse.ext.ExtParser" + contentType="application/vnd.nutch.example.md5sum" + pathSuffix=""/> + + </extension> + +</plugin> diff -Nur --exclude=crawl-urlfilter.txt 
--exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java nutch-cvs-20040709.xing/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java --- nutch-cvs-20040703/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java 2004-07-09 16:31:46.000000000 -0700 @@ -0,0 +1,107 @@ +/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.parse.ext; + +import net.nutch.protocol.Content; +import net.nutch.parse.Parser; +import net.nutch.parse.Parse; +import net.nutch.parse.ParseData; +import net.nutch.parse.ParseImpl; +import net.nutch.parse.Outlink; +import net.nutch.parse.ParseException; + +import net.nutch.util.LogFormatter; +import net.nutch.util.NutchConf; +import net.nutch.util.CommandRunner; + +import java.util.Properties; +import java.util.logging.Logger; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + +/********************************************* + * A wrapper that invokes external command to do real parsing job. 
+ * + * @author John Xing + *********************************************/ + +public class ExtParser implements Parser { + public static final Logger LOG = + LogFormatter.getLogger("net.nutch.parse.ext"); + + private final int BUFFER_SIZE = 4096; + + private final String COMMAND = + NutchConf.get("plugin.parse.ext.command", + "./build/plugins/parse-ext/command"); + + private final int TIMEOUT = + NutchConf.getInt("plugin.parse.ext.timeout",30); + + public ExtParser () {} + + public Parse getParse(Content content) throws ParseException { + + String contentType = content.getContentType(); + + String text = null; + String title = null; + + try { + + byte[] raw = content.getContent(); + + String contentLength = + (String)content.getMetadata().get("Content-Length"); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + throw new ParseException("Content truncated at "+raw.length + +" bytes. Parser can't handle incomplete "+contentType+" file."); + } + + ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); + ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4); + + CommandRunner cr = new CommandRunner(); + + cr.setCommand(COMMAND + " " +contentType); + cr.setInputStream(new ByteArrayInputStream(raw)); + cr.setStdOutputStream(os); + cr.setStdErrorStream(es); + + cr.setTimeout(TIMEOUT); + + cr.evaluate(); + + if (cr.getExitValue() != 0) + throw new ParseException("External command "+COMMAND + +" failed with error: "+es.toString()); + + text = os.toString(); + + } catch (ParseException e) { + throw e; + } catch (Exception e) { // run time exception + throw new ParseException("ExtParser failed. 
"+e); + } + + if (text == null) + text = ""; + + if (title == null) + title = ""; + + // collect outlink + Outlink[] outlinks = new Outlink[0]; + + // collect meta data + Properties metaData = new Properties(); + metaData.putAll(content.getMetadata()); // copy through + + ParseData parseData = new ParseData(title, outlinks, metaData); + return new ParseImpl(text, parseData); + } + +} diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml --exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar nutch-cvs-20040703/src/plugin/parse-ext/src/test/net/nutch/parse/ext/TestExtParser.java nutch-cvs-20040709.xing/src/plugin/parse-ext/src/test/net/nutch/parse/ext/TestExtParser.java --- nutch-cvs-20040703/src/plugin/parse-ext/src/test/net/nutch/parse/ext/TestExtParser.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040709.xing/src/plugin/parse-ext/src/test/net/nutch/parse/ext/TestExtParser.java 2004-07-09 17:20:26.000000000 -0700 @@ -0,0 +1,97 @@ +/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.parse.ext; + +import net.nutch.protocol.ProtocolFactory; +import net.nutch.protocol.Protocol; +import net.nutch.protocol.Content; +import net.nutch.protocol.ProtocolException; + +import net.nutch.parse.ParserFactory; +import net.nutch.parse.Parser; +import net.nutch.parse.Parse; +import net.nutch.parse.ParseException; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +/** + * Unit tests for ExtParser. + * First creates a temp file with fixed content, then fetch + * and parse it using external command 'cat' and 'md5sum' alternately + * for 10 times. Doing so also does a light stress test for class + * CommandRunner.java (as used in ExtParser.java). + * + * Warning: currently only do test on linux platform. 
+ * + * @author John Xing + */ +public class TestExtParser extends TestCase { + private File tempFile = null; + private String urlString = null; + private Content content = null;; + private Parser parser = null;; + private Parse parse = null; + + private String expectedText = "nutch rocks nutch rocks nutch rocks"; + // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum + private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526"; + + public TestExtParser(String name) { + super(name); + } + + protected void setUp() throws ProtocolException, IOException { + // prepare a temp file with expectedText as its content + tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",""); + urlString = tempFile.toURL().toString(); + + FileOutputStream fos = new FileOutputStream(tempFile); + fos.write(expectedText.getBytes()); + fos.close(); + + // get nutch content for tempFile + Protocol protocol = ProtocolFactory.getProtocol(urlString); + content = protocol.getContent(urlString); + protocol = null; + } + + protected void tearDown() { + // clean content + content = null; + + // clean temp file + if (tempFile != null && tempFile.exists()) + tempFile.delete(); + } + + public void testIt() throws ParseException { + String contentType; + + // now test only on linux platform + if (!System.getProperty("os.name").equalsIgnoreCase("linux")) + return; + + // loop alternately, total 10*2 times of invoking external command + for (int i=0; i<10; i++) { + // check external parser that does 'cat' + contentType = "application/vnd.nutch.example.cat"; + content.setContentType(contentType); + parser = ParserFactory.getParser(contentType, urlString); + parse = parser.getParse(content); + assertEquals(expectedText,parse.getText()); + + // check external parser that does 'md5sum' + contentType = "application/vnd.nutch.example.md5sum"; + content.setContentType(contentType); + parser = ParserFactory.getParser(contentType, urlString); + parse = parser.getParse(content); + 
assertTrue(parse.getText().startsWith(expectedMD5sum)); + } + } + +} ------------------------------------------------------- This SF.Net email sponsored by Black Hat Briefings & Training. Attend Black Hat Briefings & Training, Las Vegas July 24-29 - digital self defense, top technical experts, no vendor pitches, unmatched networking opportunities. Visit www.blackhat.com _______________________________________________ Nutch-developers mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-developers
