Hi, Doug and all,
Here is a patch for invoking an external command as a parser.
It requires concurrent-1.3.4.jar, which can be downloaded from
http://nutch.neasys.com/patch/. For details, please check my notes there
under section 20040703.
The EDU.oswego.cs.dl.util.concurrent.* API provided by concurrent-1.3.4.jar
is part of j2sdk 1.5 now, so we will no longer need this jar once we move
to j2sdk 1.5. We might also want to use this API for some of our
multi-threaded tools.
Currently, defining an external parser requires synchronized changes to
both ./nutch/build/plugins/parse-ext/command and
./nutch/build/plugins/parse-ext/plugin.xml. Ideally it would be handled
solely by plugin.xml. However, that would require the attributes in
plugin.xml to be passed to ExtParser.java, and I could not figure out a
proper way to do that with the current plugin system.
Stephan and Doug: any suggestions?
John
------------------------ patch.txt.20040703 -----------------------
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/conf/nutch-default.xml
nutch-cvs-20040703.xing/conf/nutch-default.xml
--- nutch-cvs-20040703/conf/nutch-default.xml 2004-06-30 14:54:07.000000000 -0700
+++ nutch-cvs-20040703.xing/conf/nutch-default.xml 2004-07-03 18:42:31.000000000
-0700
@@ -382,4 +382,19 @@
<description>A Directory where nutch plugin are located</description>
</property>
+<property>
+ <name>plugin.parse.ext.command</name>
+ <value>./build/plugins/parse-ext/command</value>
+ <description>Name or path of an external command that will be invoked
+ by parse-ext plugin.</description>
+</property>
+
+<property>
+ <name>plugin.parse.ext.timeout</name>
+ <value>30</value>
+ <description>Maximum running time of the external command process,
+ in seconds. The process will be terminated if it times out.
+ </description>
+</property>
+
</nutch-conf>
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/java/net/nutch/parse/ParserChecker.java
nutch-cvs-20040703.xing/src/java/net/nutch/parse/ParserChecker.java
--- nutch-cvs-20040703/src/java/net/nutch/parse/ParserChecker.java 2004-06-15
09:42:29.000000000 -0700
+++ nutch-cvs-20040703.xing/src/java/net/nutch/parse/ParserChecker.java 2004-07-02
14:30:09.000000000 -0700
@@ -53,8 +53,11 @@
Protocol protocol = ProtocolFactory.getProtocol(url);
Content content = protocol.getContent(url);
- if (!force)
+ if (force) {
+ content.setContentType(contentType);
+ } else {
contentType = content.getContentType();
+ }
if (contentType == null) {
System.err.println("");
@@ -62,6 +65,7 @@
}
LOG.info("parsing: "+url);
+ LOG.info("contentType: "+contentType);
Parser parser = ParserFactory.getParser(contentType, url);
Parse parse = parser.getParse(content);
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/java/net/nutch/util/CommandRunner.java
nutch-cvs-20040703.xing/src/java/net/nutch/util/CommandRunner.java
--- nutch-cvs-20040703/src/java/net/nutch/util/CommandRunner.java 1969-12-31
16:00:00.000000000 -0800
+++ nutch-cvs-20040703.xing/src/java/net/nutch/util/CommandRunner.java 2004-07-03
22:59:25.000000000 -0700
@@ -0,0 +1,388 @@
+/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+/*
+ * Adopted by John Xing for Nutch Project from
+ * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/,
+ * which explains the code in detail.
+ *
+ * Comments by John Xing on 20040621:
+ * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now.
+ *     Modifications are needed if we move to j2sdk 1.5.
+ * (2) The original looks good, not much to change.
+ *
+ * This code is in the public domain and comes with no warranty.
+ */
+package net.nutch.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import EDU.oswego.cs.dl.util.concurrent.BrokenBarrierException;
+import EDU.oswego.cs.dl.util.concurrent.CyclicBarrier;
+import EDU.oswego.cs.dl.util.concurrent.TimeoutException;
+
+/**
+ * Runs an external command as a child process, optionally feeding it an
+ * input stream and copying its standard output and standard error into
+ * caller-supplied streams.
+ *
+ * <p>Every stream is copied by a thread of its own. {@link #evaluate()}
+ * meets those threads at a barrier, then waits for the process to exit;
+ * if the timeout elapses first the process is optionally destroyed.</p>
+ *
+ * <p>Per-run state (barrier, exit value, last error) lives in instance
+ * fields, so one instance must not run {@link #evaluate()} from several
+ * threads at once.</p>
+ */
+public class CommandRunner {
+
+  private boolean _waitForExit = true;
+  private String _command;
+  private int _timeout = 10;
+  private boolean _destroyOnTimeout = true;
+
+  private InputStream _stdin;
+  private OutputStream _stdout;
+  private OutputStream _stderr;
+
+  /** Size in bytes of the buffer each pump thread copies with. */
+  private static final int BUF = 4096;
+
+  /** Pause in milliseconds between two polls for the exit value. */
+  private static final long POLL_INTERVAL = 250;
+
+  private int _xit;
+
+  /** Last error raised by a pump thread; volatile as it crosses threads. */
+  private volatile Throwable _thrownError;
+
+  private CyclicBarrier _barrier;
+
+  /**
+   * Returns the exit value recorded by the last {@link #evaluate()} call,
+   * or -1 if the process timed out or had not exited when it was checked.
+   */
+  public int getExitValue() {
+    return _xit;
+  }
+
+  /**
+   * Sets the command line. It is passed as one string to
+   * {@link Runtime#exec(String)}, which splits it on white space.
+   */
+  public void setCommand(String s) {
+    _command = s;
+  }
+
+  public String getCommand() {
+    return _command;
+  }
+
+  /**
+   * Sets the stream fed to the command's standard input. With no stream
+   * set, the command's standard input is closed right after start-up.
+   */
+  public void setInputStream(InputStream is) {
+    _stdin = is;
+  }
+
+  /**
+   * Sets the sink for the command's standard output; output is discarded
+   * if none is set.
+   */
+  public void setStdOutputStream(OutputStream os) {
+    _stdout = os;
+  }
+
+  /**
+   * Sets the sink for the command's standard error; output is discarded
+   * if none is set.
+   */
+  public void setStdErrorStream(OutputStream os) {
+    _stderr = os;
+  }
+
+  /**
+   * Runs the command and blocks until its streams are drained and, if
+   * waiting for exit is enabled, until it has exited or the timeout has
+   * elapsed. A timeout of 0 means no time limit.
+   *
+   * <p>Afterwards {@link #getExitValue()} holds the exit value (or -1) and
+   * {@link #getThrownError()} the last error hit while copying a stream,
+   * if any.</p>
+   *
+   * @throws IOException if the command cannot be started
+   */
+  public void evaluate() throws IOException {
+    _thrownError = null;
+
+    Process proc = Runtime.getRuntime().exec(_command);
+
+    // Parties: this thread, the two pullers and, if present, the pusher.
+    _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));
+
+    PullerThread so =
+      new PullerThread("STDOUT", proc.getInputStream(), _stdout);
+    so.start();
+
+    PullerThread se =
+      new PullerThread("STDERR", proc.getErrorStream(), _stderr);
+    se.start();
+
+    PusherThread si = null;
+    if (_stdin != null) {
+      si = new PusherThread("STDIN", _stdin, proc.getOutputStream());
+      si.start();
+    } else {
+      // Nothing to feed: give the child EOF instead of an open, silent pipe.
+      try {
+        proc.getOutputStream().close();
+      } catch (IOException ioe) {
+        /* IGNORE */
+      }
+    }
+
+    // Long arithmetic: an int product would overflow for large timeouts.
+    long timeoutMillis = _timeout * 1000L;
+    long end = (_timeout == 0)
+      ? Long.MAX_VALUE : System.currentTimeMillis() + timeoutMillis;
+
+    boolean timedOut = false;
+
+    try {
+      if (_timeout == 0) {
+        _barrier.barrier();
+      } else {
+        _barrier.attemptBarrier(timeoutMillis);
+      }
+    } catch (TimeoutException ex) {
+      timedOut = true;
+      if (si != null) {
+        si.interrupt();
+      }
+      so.interrupt();
+      se.interrupt();
+    } catch (BrokenBarrierException bbe) {
+      /* IGNORE */
+    } catch (InterruptedException e) {
+      /* IGNORE */
+    }
+
+    _xit = -1;
+
+    if (!timedOut) {
+      if (_waitForExit) {
+        timedOut = !pollExitValue(proc, end);
+      } else {
+        try {
+          _xit = proc.exitValue();
+        } catch (IllegalThreadStateException iltse) {
+          timedOut = true;
+        }
+      }
+    }
+
+    if (timedOut && _destroyOnTimeout) {
+      proc.destroy();
+    }
+  }
+
+  /**
+   * Polls the process until it has exited or the deadline has passed,
+   * sleeping between polls so that a running process is not busy-waited on.
+   *
+   * @param proc the process to watch
+   * @param end deadline as a System.currentTimeMillis() value
+   * @return true if the exit value was stored, false if the deadline passed
+   */
+  private boolean pollExitValue(Process proc, long end) {
+    while (true) {
+      try {
+        _xit = proc.exitValue();
+        return true;
+      } catch (IllegalThreadStateException iltse) {
+        // Still running: fall through to the deadline check.
+      }
+      if (System.currentTimeMillis() > end) {
+        return false;
+      }
+      try {
+        Thread.sleep(POLL_INTERVAL);
+      } catch (InterruptedException ie) {
+        /* IGNORE */
+      }
+    }
+  }
+
+  /**
+   * Returns the last error raised while copying one of the streams during
+   * the most recent {@link #evaluate()} call, or null if there was none.
+   */
+  public Throwable getThrownError() {
+    return _thrownError;
+  }
+
+  /**
+   * Copies one stream into another on a thread of its own and then joins
+   * the barrier of the enclosing runner.
+   */
+  private class PumperThread extends Thread {
+
+    private OutputStream _os;
+    private InputStream _is;
+
+    // true: close the source when done (puller); false: close the sink.
+    private boolean _closeInput;
+
+    protected PumperThread(
+      String name,
+      InputStream is,
+      OutputStream os,
+      boolean closeInput) {
+      super(name);
+      _is = is;
+      _os = os;
+      _closeInput = closeInput;
+    }
+
+    public void run() {
+      try {
+        byte[] buf = new byte[BUF];
+        int read = 0;
+        while (!isInterrupted() && (read = _is.read(buf)) != -1) {
+          // A null sink means "discard", but the source is still drained
+          // so that the child cannot block on a full pipe.
+          if (read == 0 || _os == null)
+            continue;
+          _os.write(buf, 0, read);
+          _os.flush();
+        }
+      } catch (Throwable t) {
+        // Record the failure but still fall through to the barrier below;
+        // skipping it would keep evaluate() waiting for the full timeout.
+        _thrownError = t;
+      } finally {
+        try {
+          if (_closeInput) {
+            _is.close();
+          } else if (_os != null) {
+            _os.close();
+          }
+        } catch (IOException ioe) {
+          /* IGNORE */
+        }
+      }
+      try {
+        _barrier.barrier();
+      } catch (InterruptedException ie) {
+        /* IGNORE */
+      } catch (BrokenBarrierException bbe) {
+        /* IGNORE */
+      }
+    }
+  }
+
+  /** Feeds the child's standard input and closes it when done. */
+  private class PusherThread extends PumperThread {
+    PusherThread(String name, InputStream is, OutputStream os) {
+      super(name, is, os, false);
+    }
+  }
+
+  /** Drains one of the child's output streams and closes it when done. */
+  private class PullerThread extends PumperThread {
+    PullerThread(String name, InputStream is, OutputStream os) {
+      super(name, is, os, true);
+    }
+  }
+
+  public int getTimeout() {
+    return _timeout;
+  }
+
+  /** Sets the timeout in seconds; 0 disables the time limit. */
+  public void setTimeout(int timeout) {
+    _timeout = timeout;
+  }
+
+  public boolean getDestroyOnTimeout() {
+    return _destroyOnTimeout;
+  }
+
+  /** Sets whether the process is destroyed when it times out. */
+  public void setDestroyOnTimeout(boolean destroyOnTimeout) {
+    _destroyOnTimeout = destroyOnTimeout;
+  }
+
+  public boolean getWaitForExit() {
+    return _waitForExit;
+  }
+
+  /**
+   * Sets whether evaluate() keeps polling for the exit value until the
+   * timeout, or checks it only once after the streams are drained.
+   */
+  public void setWaitForExit(boolean waitForExit) {
+    _waitForExit = waitForExit;
+  }
+
+  /** Prints the usage line to standard error and exits with status -1. */
+  private static void usage() {
+    System.err.println(
+      "Usage: CommandRunner [-timeout timeout] commandPath filePath");
+    System.exit(-1);
+  }
+
+  /**
+   * Command-line entry point: runs commandPath with filePath as its
+   * standard input and echoes the command's output and exit value.
+   */
+  public static void main(String[] args) throws Exception {
+    String commandPath = null;
+    String filePath = null;
+    int timeout = 10;
+
+    if (args.length < 2) {
+      usage();
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-timeout")) {
+        if (i + 1 >= args.length) {
+          usage();
+        }
+        timeout = Integer.parseInt(args[++i]);
+      } else if (i != args.length-2) {
+        usage();
+      } else {
+        commandPath = args[i];
+        filePath = args[++i];
+      }
+    }
+
+    // E.g. "-timeout 5" alone: options were given but no command and file.
+    if (commandPath == null || filePath == null) {
+      usage();
+    }
+
+    CommandRunner cr = new CommandRunner();
+
+    cr.setCommand(commandPath);
+    cr.setStdErrorStream(System.err);
+    cr.setStdOutputStream(System.out);
+
+    cr.setTimeout(timeout);
+
+    java.io.FileInputStream in = new java.io.FileInputStream(filePath);
+    try {
+      cr.setInputStream(in);
+      cr.evaluate();
+    } finally {
+      in.close();
+    }
+
+    System.err.println("output value: "+cr.getExitValue());
+  }
+}
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/plugin/build.xml nutch-cvs-20040703.xing/src/plugin/build.xml
--- nutch-cvs-20040703/src/plugin/build.xml 2004-06-30 15:15:51.000000000 -0700
+++ nutch-cvs-20040703.xing/src/plugin/build.xml 2004-07-03 18:55:56.000000000
-0700
@@ -13,6 +13,7 @@
<ant dir="parse-text" target="deploy"/>
<ant dir="parse-pdf" target="deploy"/>
<ant dir="parse-msword" target="deploy"/>
+ <ant dir="parse-ext" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="creativecommons" target="deploy"/>
<ant dir="languageidentifier" target="deploy"/>
@@ -37,6 +38,7 @@
<ant dir="parse-text" target="clean"/>
<ant dir="parse-pdf" target="clean"/>
<ant dir="parse-msword" target="clean"/>
+ <ant dir="parse-ext" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="creativecommons" target="clean"/>
<ant dir="languageidentifier" target="clean"/>
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/plugin/parse-ext/build.xml
nutch-cvs-20040703.xing/src/plugin/parse-ext/build.xml
--- nutch-cvs-20040703/src/plugin/parse-ext/build.xml 1969-12-31 16:00:00.000000000
-0800
+++ nutch-cvs-20040703.xing/src/plugin/parse-ext/build.xml 2004-07-02
21:51:43.000000000 -0700
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+
+<project name="parse-ext" default="jar">
+
+ <import file="../build-plugin.xml"/>
+ <!-- Top-level tasks (outside any target): copy the sample command script into ${deploy.dir} and make it executable. -->
+ <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
+ <chmod file="${deploy.dir}/command" perm="755"/>
+
+</project>
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/plugin/parse-ext/command
nutch-cvs-20040703.xing/src/plugin/parse-ext/command
--- nutch-cvs-20040703/src/plugin/parse-ext/command 1969-12-31 16:00:00.000000000
-0800
+++ nutch-cvs-20040703.xing/src/plugin/parse-ext/command 2004-07-02
14:41:14.000000000 -0700
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Sample bash script as external command invoked by parse-ext plugin
+# Usage: command mimeType   (dispatches on the MIME type given as $1)
+# 20040701, John Xing
+
+set -e   # abort as soon as any command fails
+
+if [ $# -ne 1 ]; then   # exactly one argument, the MIME type, is required
+ echo Usage:$0 mimeType >&2
+ exit 1
+fi
+
+case $1 in
+"application/vnd.nutch.example.cat")
+ cat   # copy stdin to stdout unchanged
+ ;;
+"application/vnd.nutch.example.md5sum")
+ md5sum   # print the MD5 digest of stdin
+ ;;
+*)
+ echo "Can't parse mimeType $1" >&2
+ exit 1
+esac
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/plugin/parse-ext/plugin.xml
nutch-cvs-20040703.xing/src/plugin/parse-ext/plugin.xml
--- nutch-cvs-20040703/src/plugin/parse-ext/plugin.xml 1969-12-31 16:00:00.000000000
-0800
+++ nutch-cvs-20040703.xing/src/plugin/parse-ext/plugin.xml 2004-07-02
14:41:35.000000000 -0700
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-ext"
+ name="External Parser Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <extension-point
+ id="net.nutch.parse.Parser"
+ name="Nutch Content Parser"/>
+
+ <runtime>
+ <library name="parse-ext.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="net.nutch.parse.ext"
+ name="ExtParse"
+ point="net.nutch.parse.Parser">
+ <!-- One implementation per MIME type; each contentType must also have a branch in the "command" script. -->
+ <implementation id="net.nutch.parse.ext.ExtParser"
+ class="net.nutch.parse.ext.ExtParser"
+ contentType="application/vnd.nutch.example.cat"
+ pathSuffix=""/>
+
+ <implementation id="net.nutch.parse.ext.ExtParser"
+ class="net.nutch.parse.ext.ExtParser"
+ contentType="application/vnd.nutch.example.md5sum"
+ pathSuffix=""/>
+
+ </extension>
+
+</plugin>
diff -Nur --exclude=crawl-urlfilter.txt --exclude=nutch-site.xml
--exclude=regex-urlfilter.txt --exclude=concurrent-1.3.4.jar
nutch-cvs-20040703/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java
nutch-cvs-20040703.xing/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java
---
nutch-cvs-20040703/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040703.xing/src/plugin/parse-ext/src/java/net/nutch/parse/ext/ExtParser.java
2004-07-03 22:52:48.000000000 -0700
@@ -0,0 +1,111 @@
+/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.parse.ext;
+
+import net.nutch.protocol.Content;
+import net.nutch.parse.Parser;
+import net.nutch.parse.Parse;
+import net.nutch.parse.ParseData;
+import net.nutch.parse.ParseImpl;
+import net.nutch.parse.Outlink;
+import net.nutch.parse.ParseException;
+
+import net.nutch.util.LogFormatter;
+import net.nutch.util.NutchConf;
+import net.nutch.util.CommandRunner;
+
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+
+/*********************************************
+ * Parser that delegates the actual parsing to an external command.
+ * The raw content is piped to the command's standard input, the MIME type
+ * is appended to the command line, and whatever the command prints on
+ * standard output becomes the parse text.
+ *
+ * @author John Xing
+ *********************************************/
+
+public class ExtParser implements Parser {
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.parse.ext");
+
+  private final int BUFFER_SIZE = 4096;
+
+  private final String COMMAND =
+    NutchConf.get("plugin.parse.ext.command",
+                  "./build/plugins/parse-ext/command");
+
+  private final int TIMEOUT =
+    NutchConf.getInt("plugin.parse.ext.timeout",30);
+
+  public ExtParser () {}
+
+  /**
+   * Parses the content by running the configured external command on it.
+   * The resulting parse has an empty title, no outlinks and a copy of the
+   * content's metadata.
+   *
+   * @throws ParseException if the content is truncated, the command exits
+   *         with a non-zero value, or anything else goes wrong on the way
+   */
+  public Parse getParse(Content content) throws ParseException {
+    final String mimeType = content.getContentType();
+
+    String parsedText;
+    try {
+      parsedText = runCommand(content, mimeType);
+    } catch (ParseException pe) {
+      throw pe;
+    } catch (Exception ex) { // run time exception
+      throw new ParseException("ExtParser failed. "+ex);
+    }
+
+    // metadata is copied through unchanged
+    Properties metaData = new Properties();
+    metaData.putAll(content.getMetadata());
+
+    // the external command supplies neither a title nor outlinks
+    ParseData parseData = new ParseData("", new Outlink[0], metaData);
+    return new ParseImpl(parsedText, parseData);
+  }
+
+  /**
+   * Feeds the raw content to the external command and returns what the
+   * command wrote to its standard output.
+   */
+  private String runCommand(Content content, String mimeType)
+    throws ParseException, java.io.IOException {
+    byte[] bytes = content.getContent();
+
+    // refuse content whose size disagrees with its Content-Length header
+    String declared = (String)content.getMetadata().get("Content-Length");
+    if (declared != null && Integer.parseInt(declared) != bytes.length) {
+      throw new ParseException("Content truncated at "+bytes.length
+        +" bytes. Parser can't handle incomplete "+mimeType+" file.");
+    }
+
+    ByteArrayOutputStream stdout = new ByteArrayOutputStream(BUFFER_SIZE);
+    ByteArrayOutputStream stderr = new ByteArrayOutputStream(BUFFER_SIZE/4);
+
+    CommandRunner runner = new CommandRunner();
+    runner.setCommand(COMMAND + " " +mimeType);
+    runner.setInputStream(new ByteArrayInputStream(bytes));
+    runner.setStdOutputStream(stdout);
+    runner.setStdErrorStream(stderr);
+    runner.setTimeout(TIMEOUT);
+
+    runner.evaluate();
+
+    if (runner.getExitValue() == 0) {
+      return stdout.toString();
+    }
+    throw new ParseException("External command "+COMMAND
+      +" failed with error: "+stderr.toString());
+  }
+
+}
-------------------------------------------------------
This SF.Net email sponsored by Black Hat Briefings & Training.
Attend Black Hat Briefings & Training, Las Vegas July 24-29 -
digital self defense, top technical experts, no vendor pitches,
unmatched networking opportunities. Visit www.blackhat.com
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers