Hi, Doug,
Here is a patch for pdf & msword parsing.
Necessary jars are available at
http://nutch.neasys.com/patch/
Check for tag 20040614.
Also included is a simple ParserChecker.java,
handy for parser testing.
You will have to apply my previous patches first.
Cheers,
John
-------------------------- patch.txt.20040614 -----------------------------
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/java/net/nutch/parse/ParserChecker.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/java/net/nutch/parse/ParserChecker.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/java/net/nutch/parse/ParserChecker.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/java/net/nutch/parse/ParserChecker.java
2004-06-14 23:02:10.000000000 -0700
@@ -0,0 +1,100 @@
+package net.nutch.parse;
+
+import net.nutch.util.LogFormatter;
+
+import net.nutch.protocol.ProtocolFactory;
+import net.nutch.protocol.Protocol;
+import net.nutch.protocol.Content;
+
+import java.util.logging.Logger;
+
+/*********************************************
+ * Parser checker, useful for testing parser.
+ *
+ * Fetches the given url, picks a parser by content type (the fetched one,
+ * or the one forced with -forceAs) and prints the resulting parse data,
+ * plus the parse text when -dumpText is given.
+ *
+ * @author John Xing
+ *********************************************/
+
+public class ParserChecker {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.parse.ParserChecker");
+
+  public ParserChecker() {}
+
+  /**
+   * Command line entry point.
+   *
+   * @param args [-dumpText] [-forceAs mimeType] url; the url must come last
+   * @throws Exception whatever fetching or parsing throws
+   */
+  public static void main(String[] args) throws Exception {
+    boolean dumpText = false;
+    boolean force = false;
+    String contentType = null;
+    String url = null;
+
+    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-forceAs")) {
+        // -forceAs must be followed by a mime type; without this check
+        // args[++i] would run past the end of the array
+        if (i == args.length-1) {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+        force = true;
+        contentType = args[++i];
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (i != args.length-1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        url = args[i];
+      }
+    }
+
+    // only options were given (e.g. "-dumpText" alone): nothing to fetch
+    if (url == null) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    LOG.info("fetching: "+url);
+
+    Protocol protocol = ProtocolFactory.getProtocol(url);
+    Content content = protocol.getContent(url);
+
+    if (!force)
+      contentType = content.getContentType();
+
+    if (contentType == null) {
+      System.err.println("Content type is unknown; use -forceAs to specify one.");
+      System.exit(-1);
+    }
+
+    LOG.info("parsing: "+url);
+
+    Parser parser = ParserFactory.getParser(contentType, url);
+    Parse parse = parser.getParse(content);
+
+    System.out.print("---------\nParseData\n---------\n");
+    System.out.print(parse.getData().toString());
+    if (dumpText) {
+      System.out.print("---------\nParseText\n---------\n");
+      System.out.print(parse.getText());
+    }
+
+    System.exit(0);
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/build.xml
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/build.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/build.xml
2004-06-14 16:47:26.000000000 -0700
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/build.xml
2004-06-14 15:45:57.000000000 -0700
@@ -11,6 +11,8 @@
<ant dir="protocol-http" target="deploy"/>
<ant dir="parse-html" target="deploy"/>
<ant dir="parse-text" target="deploy"/>
+ <ant dir="parse-pdf" target="deploy"/>
+ <ant dir="parse-msword" target="deploy"/>
<ant dir="creativecommons" target="deploy"/>
</target>
@@ -31,6 +33,8 @@
<ant dir="protocol-http" target="clean"/>
<ant dir="parse-html" target="clean"/>
<ant dir="parse-text" target="clean"/>
+ <ant dir="parse-pdf" target="clean"/>
+ <ant dir="parse-msword" target="clean"/>
</target>
</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/build.xml
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/build.xml
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/build.xml
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/build.xml
2004-06-14 15:23:33.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-msword" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/plugin.xml
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/plugin.xml
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/plugin.xml
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/plugin.xml
2004-06-14 15:26:15.000000000 -0700
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-msword"
+ name="MSWord Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <extension-point
+ id="net.nutch.parse.Parser"
+ name="Nutch Content Parser"/>
+
+ <!--
+ <extension-point
+ id="net.nutch.parse.MSWordParseFilter"
+ name="MSWord Parse Filter"/>
+ -->
+
+ <runtime>
+ <library name="parse-msword.jar">
+ <export name="*"/>
+ </library>
+ <library name="poi-2.1-20040508.jar"/>
+ <library name="poi-scratchpad-2.1-20040508.jar"/>
+ </runtime>
+
+ <extension id="net.nutch.parse.msword"
+ name="MSWordParse"
+ point="net.nutch.parse.Parser">
+
+ <implementation id="net.nutch.parse.msword.MSWordParser"
+ class="net.nutch.parse.msword.MSWordParser"
+ contentType="application/msword"
+ pathSuffix=""/>
+
+ </extension>
+
+</plugin>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java
2004-06-14 22:04:05.000000000 -0700
@@ -0,0 +1,80 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.nutch.parse.msword.chp;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.OutputStream;
+import java.io.IOException;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.io.*;
+import org.apache.poi.hwpf.model.*;
+
+/**
+ * This class holds all of the character formatting properties from a Word
+ * 6.0/95 document.
+ *
+ * @author Ryan Ackley
+ */
+public class Word6CHPBinTable
+{
+  /** List of character properties.*/
+  ArrayList _textRuns = new ArrayList();
+
+  /**
+   * Constructor used to read a binTable in from a Word document. Every CHPX
+   * of every formatted disk page referenced by the bin table is collected.
+   *
+   * @param documentStream The POIFS "WordDocument" stream from a Word document
+   * @param offset The offset of the Chp bin table in the main stream.
+   * @param size The size of the Chp bin table in the main stream.
+   * @param fcMin The start of text in the main stream.
+   */
+  public Word6CHPBinTable(byte[] documentStream, int offset,
+                          int size, int fcMin)
+  {
+    PlexOfCps plex = new PlexOfCps(documentStream, offset, size, 2);
+    int entryCount = plex.length();
+
+    for (int entry = 0; entry < entryCount; entry++)
+    {
+      // each entry holds a page number; pages are BIG_BLOCK_SIZE bytes apart
+      byte[] entryBytes = (byte[])plex.getProperty(entry).getBytes();
+      int pageNumber = LittleEndian.getShort(entryBytes);
+      int pageStart = POIFSConstants.BIG_BLOCK_SIZE * pageNumber;
+
+      CHPFormattedDiskPage page =
+        new CHPFormattedDiskPage(documentStream, pageStart, fcMin);
+
+      int runCount = page.size();
+      for (int run = 0; run < runCount; run++)
+      {
+        _textRuns.add(page.getCHPX(run));
+      }
+    }
+  }
+
+  /**
+   * @return the collected character runs, in the order they were read
+   */
+  public List getTextRuns()
+  {
+    return _textRuns;
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java
2004-06-14 15:43:36.000000000 -0700
@@ -0,0 +1,34 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+/**
+ * Thrown by WordExtractor when the fast-saved flag of a Word document is
+ * set; such files are not supported by the extractor.
+ *
+ * <p>Copyright: Copyright (c) 2003</p>
+ * @author not attributable
+ * @version 1.0
+ */
+
+public class FastSavedException extends Exception
+{
+ public FastSavedException(String msg)
+ {
+ super(msg);
+ }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java
2004-06-14 21:55:24.000000000 -0700
@@ -0,0 +1,108 @@
+/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.parse.msword;
+
+import net.nutch.protocol.Content;
+import net.nutch.util.LogFormatter;
+import net.nutch.parse.Parser;
+import net.nutch.parse.Parse;
+import net.nutch.parse.ParseData;
+import net.nutch.parse.ParseImpl;
+import net.nutch.parse.Outlink;
+import net.nutch.parse.ParseException;
+
+import java.util.Properties;
+//import java.util.logging.Logger;
+
+import java.io.ByteArrayInputStream;
+
+/*********************************************
+ * Parser for mime type application/msword.
+ * It is based on org.apache.poi.*. We have to see how well it performs.
+ *
+ * @author John Xing
+ *
+ * Note on 20040614 by Xing:
+ * Some code is stacked here for convenience (see inline comments).
+ * It may be moved to more appropriate places when the new codebase
+ * stabilizes, especially after code for indexing is written.
+ *
+ *********************************************/
+
+public class MSWordParser implements Parser {
+//  public static final Logger LOG =
+//    LogFormatter.getLogger("net.nutch.parse.msword");
+
+  public MSWordParser () {}
+
+  /**
+   * Extracts the text of an msword document. The title is always empty and
+   * no outlinks are produced; the content's meta data is copied through.
+   *
+   * @param content the fetched document
+   * @return the parse holding the extracted text
+   * @throws ParseException if a content type is given that does not start
+   *         with application/msword, the number of bytes differs from the
+   *         Content-Length meta data, or the text extraction fails
+   */
+  public Parse getParse(Content content) throws ParseException {
+
+    // check that contentType is one we can handle
+    String contentType = content.getContentType();
+    if (contentType != null && !contentType.startsWith("application/msword"))
+      throw new ParseException(
+        "Content-Type not application/msword: "+contentType);
+
+    String text = null;
+
+    try {
+
+      byte[] bytes = content.getContent();
+
+      // a Content-Length that disagrees with the bytes at hand means the
+      // document is incomplete
+      String declaredLength =
+        (String)content.getMetaData().get("Content-Length");
+      if (declaredLength != null
+          && bytes.length != Integer.parseInt(declaredLength)) {
+        throw new ParseException("Content truncated at "+bytes.length
+          +" bytes. Parser can't handle incomplete msword file.");
+      }
+
+      // collect text
+      text = new WordExtractor().extractText(new ByteArrayInputStream(bytes));
+
+      // collect meta info
+      // not yet
+
+    } catch (ParseException e) {
+      throw e;
+    } catch (FastSavedException e) {
+      throw new ParseException(e);
+    } catch (PasswordProtectedException e) {
+      throw new ParseException(e);
+    } catch (Exception e) { // run time exception
+      throw new ParseException("Can't be handled as msword document. "+e);
+    }
+
+    if (text == null)
+      text = "";
+
+    // no title is extracted yet
+    String title = "";
+
+    // collect outlink
+    Outlink[] outlinks = new Outlink[0];
+
+    // collect meta data
+    Properties metaData = new Properties();
+    metaData.putAll(content.getMetaData()); // copy through
+
+    ParseData parseData = new ParseData(title, outlinks, metaData);
+    return new ParseImpl(text, parseData);
+    // any filter?
+    //return HtmlParseFilters.filter(content, parse, root);
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java
2004-06-14 15:47:11.000000000 -0700
@@ -0,0 +1,25 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+/** Thrown by WordExtractor when a Word document is flagged as password protected. */
+public class PasswordProtectedException
+ extends Exception
+{
+ public PasswordProtectedException(String msg)
+ {
+ super(msg);
+ }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java
2004-06-14 15:44:05.000000000 -0700
@@ -0,0 +1,52 @@
+package net.nutch.parse.msword;
+
+import java.io.*;
+
+/**
+ * Manual check for WordExtractor: extracts the text of the Word file named
+ * by the first argument, prints it to stdout and writes it to ./test.txt.
+ *
+ * Copyright: Copyright (c) 2003
+ * @version 1.0
+ */
+
+class Test
+{
+
+  public Test()
+  {
+  }
+
+  public static void main(String[] args)
+  {
+    if (args.length < 1)
+    {
+      System.err.println("Usage: Test wordFile");
+      return;
+    }
+
+    InputStream in = null;
+    OutputStreamWriter out = null;
+    try
+    {
+      in = new FileInputStream(args[0]);
+      WordExtractor extractor = new WordExtractor();
+      String s = extractor.extractText(in);
+      System.out.println(s);
+      //out = new OutputStreamWriter(new FileOutputStream("C:\\test.txt"), "UTF-16LE");
+      out = new OutputStreamWriter(new FileOutputStream("./test.txt"));
+      out.write(s);
+      out.flush();
+    }
+    catch (Exception e)
+    {
+      e.printStackTrace();
+    }
+    finally
+    {
+      // close both streams even when extraction or writing failed
+      try { if (out != null) out.close(); } catch (IOException ignored) {}
+      try { if (in != null) in.close(); } catch (IOException ignored) {}
+    }
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java
2004-06-14 22:07:53.000000000 -0700
@@ -0,0 +1,159 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+import net.nutch.parse.msword.chp.*;
+
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.sprm.*;
+
+import java.util.*;
+import java.io.*;
+
+/**
+ * This class is used to extract text from Word 6 documents only. It should
+ * only be called from net.nutch.parse.msword.WordExtractor because
+ * it will automatically determine the version.
+ *
+ * @author Ryan Ackley
+ */
+class Word6Extractor
+{
+
+  public Word6Extractor()
+  {
+  }
+
+  /**
+   * Extracts the text
+   *
+   * @param mainStream The POIFS document stream entitled "WordDocument".
+   *
+   * @return The text from the document
+   * @throws Exception If there are any unexpected exceptions.
+   */
+  public String extractText(byte[] mainStream) throws Exception
+  {
+    int fcMin = LittleEndian.getInt(mainStream, 0x18);
+    int fcMax = LittleEndian.getInt(mainStream, 0x1C);
+
+    int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
+    int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
+
+    // get a list of character properties
+    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
+      chpTableSize, fcMin);
+    List textRuns = chpTable.getTextRuns();
+
+    // iterate through the text runs, collecting those that are not deleted
+    WordTextBuffer finalTextBuf = new WordTextBuffer();
+    Iterator runsIt = textRuns.iterator();
+    while(runsIt.hasNext())
+    {
+      CHPX chpx = (CHPX)runsIt.next();
+      int runStart = chpx.getStart() + fcMin;
+      int runEnd = chpx.getEnd() + fcMin;
+
+      if (!isDeleted(chpx.getGrpprl()))
+      {
+        // text is only read up to fcMax; a run that starts past fcMax would
+        // give a negative length and make the String constructor throw
+        int length = Math.min(runEnd, fcMax) - runStart;
+        if (length > 0)
+        {
+          finalTextBuf.append(new String(mainStream, runStart, length, "Cp1252"));
+        }
+        if (runEnd >= fcMax)
+        {
+          break;
+        }
+      }
+    }
+
+    return finalTextBuf.toString();
+  }
+
+  /**
+   * Used to determine if a run of text has been deleted.
+   *
+   * @param grpprl The list of sprms for this run of text.
+   * @return the value of the last deletion flag (sprm 65) found in grpprl,
+   *         or false if there is none.
+   */
+  private boolean isDeleted(byte[] grpprl)
+  {
+    int offset = 0;
+    boolean deleted = false;
+    while (offset < grpprl.length)
+    {
+      switch (LittleEndian.getUnsignedByte(grpprl, offset++))
+      {
+        // one-byte operand holding the flag this method reports
+        case 65:
+          if (offset >= grpprl.length)
+          {
+            return deleted; // operand is missing: grpprl is truncated
+          }
+          deleted = grpprl[offset++] != 0;
+          break;
+
+        // variable-length operands: the next byte holds the length. It is read
+        // as unsigned; a signed read turns lengths above 127 negative, moving
+        // offset backwards (endless loop or an out-of-range index).
+        // NOTE(review): as before, the length byte itself is not skipped --
+        // confirm against the Word 6 sprm layout.
+        case 68: case 74: case 81: case 82:
+        case 103: case 105: case 106: case 108:
+          if (offset >= grpprl.length)
+          {
+            return deleted; // length byte is missing: grpprl is truncated
+          }
+          offset += LittleEndian.getUnsignedByte(grpprl, offset);
+          break;
+
+        // one-byte operands
+        case 66: case 67: case 71: case 75:
+        case 85: case 86: case 87: case 88: case 89: case 90: case 91: case 92:
+        case 94: case 98: case 99: case 100: case 101: case 102: case 104:
+        case 117: case 118:
+          offset++;
+          break;
+
+        // two-byte operands
+        case 69: case 72: case 80: case 93: case 96: case 97:
+        case 107: case 109: case 110:
+          offset += 2;
+          break;
+
+        // three-byte operands
+        case 73: case 95:
+          offset += 3;
+          break;
+
+        // four-byte operand
+        case 70:
+          offset += 4;
+          break;
+
+        // 83 and any code not listed above: nothing is skipped
+        default:
+          break;
+      }
+    }
+    return deleted;
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java
2004-06-14 22:08:23.000000000 -0700
@@ -0,0 +1,233 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.sprm.*;
+
+import java.util.*;
+import java.io.*;
+
+/**
+ * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
+ *
+ * @author Ryan Ackley
+ */
+public class WordExtractor
+{
+
+  /**
+   * Constructor
+   */
+  public WordExtractor()
+  {
+  }
+
+  /**
+   * Gets the text from a Word document.
+   *
+   * @param in The InputStream representing the Word file.
+   * @return The text of all runs that are not marked as deleted.
+   * @throws FastSavedException If the document's fast-saved flag is set.
+   * @throws PasswordProtectedException If the document's password flag is set.
+   * @throws Exception If there are any unexpected exceptions.
+   */
+  public String extractText(InputStream in) throws Exception
+  {
+    POIFSFileSystem fsys = new POIFSFileSystem(in);
+
+    // load our POIFS document streams.
+    DocumentEntry headerProps =
+        (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
+    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
+    byte[] header = new byte[headerProps.getSize()];
+
+    din.read(header);
+    din.close();
+
+    int info = LittleEndian.getShort(header, 0xa);
+    if ((info & 0x4) != 0)
+    {
+      throw new FastSavedException("Fast-saved files are unsupported at this time");
+    }
+    if ((info & 0x100) != 0)
+    {
+      throw new PasswordProtectedException("This document is password protected");
+    }
+
+    // determine the version of Word this document came from.
+    int nFib = LittleEndian.getShort(header, 0x2);
+    switch (nFib)
+    {
+      case 101:
+      case 102:
+      case 103:
+      case 104:
+        // this is a Word 6.0 doc send it to the extractor for that version.
+        Word6Extractor oldExtractor = new Word6Extractor();
+        return oldExtractor.extractText(header);
+    }
+
+    //Get the information we need from the header
+    boolean useTable1 = (info & 0x200) != 0;
+
+    //get the location of the piece table
+    int complexOffset = LittleEndian.getInt(header, 0x1a2);
+
+    // determine which table stream we must use.
+    String tableName = null;
+    if (useTable1)
+    {
+      tableName = "1Table";
+    }
+    else
+    {
+      tableName = "0Table";
+    }
+
+    DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
+    byte[] tableStream = new byte[table.getSize()];
+
+    din = fsys.createDocumentInputStream(tableName);
+
+    din.read(tableStream);
+    din.close();
+
+    int chpOffset = LittleEndian.getInt(header, 0xfa);
+    int chpSize = LittleEndian.getInt(header, 0xfe);
+    int fcMin = LittleEndian.getInt(header, 0x18);
+    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
+
+    // load our text pieces and our character runs
+    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
+    TextPieceTable tpt = cft.getTextPieceTable();
+    List textPieces = tpt.getTextPieces();
+
+    // make the POIFS objects available for garbage collection
+    din = null;
+    fsys = null;
+    table = null;
+    headerProps = null;
+
+    List textRuns = cbt.getTextRuns();
+    Iterator runIt = textRuns.iterator();
+    Iterator textIt = textPieces.iterator();
+
+    WordTextBuffer finalTextBuf = new WordTextBuffer();
+
+    // without any text piece there is nothing to extract; calling next()
+    // here would throw NoSuchElementException
+    if (!textIt.hasNext())
+    {
+      return finalTextBuf.toString();
+    }
+
+    TextPiece currentPiece = (TextPiece)textIt.next();
+    int currentTextStart = currentPiece.getStart();
+    int currentTextEnd = currentPiece.getEnd();
+
+    // iterate through all text runs extract the text only if they haven't been
+    // deleted
+    while (runIt.hasNext())
+    {
+      CHPX chpx = (CHPX)runIt.next();
+      boolean deleted = isDeleted(chpx.getGrpprl());
+      if (deleted)
+      {
+        continue;
+      }
+
+      int runStart = chpx.getStart();
+      int runEnd = chpx.getEnd();
+
+      // skip ahead to the text piece that contains the start of this run
+      while (runStart >= currentTextEnd)
+      {
+        if (!textIt.hasNext())
+        {
+          // the run starts beyond the last text piece: return what we have,
+          // as the multi-piece branch below does when pieces run out
+          return finalTextBuf.toString();
+        }
+        currentPiece = (TextPiece) textIt.next ();
+        currentTextStart = currentPiece.getStart ();
+        currentTextEnd = currentPiece.getEnd ();
+      }
+
+      if (runEnd < currentTextEnd)
+      {
+        String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+        finalTextBuf.append(str);
+      }
+      else if (runEnd > currentTextEnd)
+      {
+        while (runEnd > currentTextEnd)
+        {
+          String str = currentPiece.substring(runStart - currentTextStart,
+                                   currentTextEnd - currentTextStart);
+          finalTextBuf.append(str);
+          if (textIt.hasNext())
+          {
+            currentPiece = (TextPiece) textIt.next ();
+            currentTextStart = currentPiece.getStart ();
+            runStart = currentTextStart;
+            currentTextEnd = currentPiece.getEnd ();
+          }
+          else
+          {
+            return finalTextBuf.toString();
+          }
+        }
+        String str = currentPiece.substring(0, runEnd - currentTextStart);
+        finalTextBuf.append(str);
+      }
+      else
+      {
+        String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+        if (textIt.hasNext())
+        {
+          currentPiece = (TextPiece) textIt.next();
+          currentTextStart = currentPiece.getStart();
+          currentTextEnd = currentPiece.getEnd();
+        }
+        finalTextBuf.append(str);
+      }
+    }
+    return finalTextBuf.toString();
+  }
+
+  /**
+   * Used to determine if a run of text has been deleted.
+   *
+   * @param grpprl The list of sprms for a particular run of text.
+   * @return true if this run of text has been deleted.
+   */
+  private boolean isDeleted(byte[] grpprl)
+  {
+    SprmIterator iterator = new SprmIterator(grpprl,0);
+    while (iterator.hasNext())
+    {
+      SprmOperation op = iterator.next();
+      // 0 is the operation that signals a FDelRMark operation
+      if (op.getOperation() == 0 && op.getOperand() != 0)
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java
2004-06-14 15:44:59.000000000 -0700
@@ -0,0 +1,81 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+
+/**
+ * This class acts as a StringBuffer for text from a word document. It allows
+ * processing of characters before they are added to the buffer: a carriage
+ * return is expanded to CR LF, and text between a 0x13 character and the
+ * following 0x14 or 0x15 character is dropped.
+ * @author Ryan Ackley
+ * @version 1.0
+ */
+public class WordTextBuffer
+{
+  StringBuffer _buf;
+  boolean _hold;
+
+  public WordTextBuffer()
+  {
+    _buf = new StringBuffer();
+    _hold = false;
+  }
+
+  /**
+   * Appends text, filtering it as described in the class comment.
+   *
+   * @param text the raw text of a run; must not be null
+   */
+  public void append(String text)
+  {
+    char[] letters = text.toCharArray();
+    for (int x = 0; x < letters.length; x++)
+    {
+      switch(letters[x])
+      {
+        case '\r':
+          _buf.append("\r\n");
+          break;
+        case 0x13:
+          // field begin mark: hold back the field code that follows
+          _hold = true;
+          break;
+        case 0x14:
+        case 0x15:
+          // field separator or field end mark: resume output. Handling the
+          // end mark matters for fields without a separator, which would
+          // otherwise leave _hold set and drop all text that follows.
+          _hold = false;
+          break;
+        default:
+          if (!_hold)
+          {
+            _buf.append(letters[x]);
+          }
+          break;
+      }
+    }
+  }
+
+  /**
+   * @return the filtered text appended so far
+   */
+  public String toString()
+  {
+    return _buf.toString();
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
2004-06-14 15:45:15.000000000 -0700
@@ -0,0 +1,61 @@
+/* Copyright 2004 Ryan Ackley
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+/**
+ * This class stores info about the data structure describing a chunk of text
+ * in a Word document. Specifically, whether or not a Range of text uses
+ * unicode or Cp1252 encoding.
+ *
+ * @author Ryan Ackley
+ */
+
+class WordTextPiece
+{
+  private final int _start;
+  private final int _size;
+  private final boolean _unicode;
+
+  /**
+   * @param start value later returned by getStart()
+   * @param length value later returned by getLength()
+   * @param unicode value later returned by usesUnicode()
+   */
+  public WordTextPiece(int start, int length, boolean unicode)
+  {
+    _start = start;
+    _size = length;
+    _unicode = unicode;
+  }
+
+  /** @return the unicode flag given at construction */
+  public boolean usesUnicode()
+  {
+    return _unicode;
+  }
+
+  /** @return the start given at construction */
+  public int getStart()
+  {
+    return _start;
+  }
+
+  /** @return the length given at construction */
+  public int getLength()
+  {
+    return _size;
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/build.xml
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/build.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/build.xml
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/build.xml
2004-06-13 22:22:11.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-pdf" default="jar">
+ <!-- No targets of its own: the build (presumably including "jar") comes from the import. -->
+ <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/plugin.xml
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/plugin.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/plugin.xml
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/plugin.xml
2004-06-14 11:03:46.000000000 -0700
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-pdf"
+ name="Pdf Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Extension point that the extension at the bottom of this file plugs into. -->
+ <extension-point
+ id="net.nutch.parse.Parser"
+ name="Nutch Content Parser"/>
+
+ <!--
+ <extension-point
+ id="net.nutch.parse.PdfParseFilter"
+ name="PDF Parse Filter"/>
+ -->
+ <!-- Libraries this plugin declares; only parse-pdf.jar carries an export. -->
+ <runtime>
+ <library name="parse-pdf.jar">
+ <export name="*"/>
+ </library>
+ <library name="PDFBox-0.6.5.jar"/>
+ <library name="log4j.jar"/>
+ </runtime>
+ <!-- Registers PdfParser for content type application/pdf, with an empty pathSuffix. -->
+ <extension id="net.nutch.parse.pdf"
+ name="PdfParse"
+ point="net.nutch.parse.Parser">
+
+ <implementation id="net.nutch.parse.pdf.PdfParser"
+ class="net.nutch.parse.pdf.PdfParser"
+ contentType="application/pdf"
+ pathSuffix=""/>
+
+ </extension>
+
+</plugin>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar'
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
---
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
2004-06-14 14:52:14.000000000 -0700
@@ -0,0 +1,172 @@
+/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.parse.pdf;
+
+import org.pdfbox.encryption.DecryptDocument;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDDocumentInformation;
+import org.pdfbox.util.PDFTextStripper;
+
+import org.pdfbox.exceptions.CryptographyException;
+import org.pdfbox.exceptions.InvalidPasswordException;
+
+import net.nutch.protocol.Content;
+import net.nutch.util.LogFormatter;
+import net.nutch.parse.Parser;
+import net.nutch.parse.Parse;
+import net.nutch.parse.ParseData;
+import net.nutch.parse.ParseImpl;
+import net.nutch.parse.Outlink;
+import net.nutch.parse.ParseException;
+
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+/*********************************************
+ * parser for mime type application/pdf.
+ * It is based on org.pdfbox.*. We have to see how well it does the job.
+ *
+ * @author John Xing
+ *
+ * Note on 20040614 by Xing:
+ * Some code is kept here for convenience (see inline comments).
+ * It may be moved to a more appropriate place once the new codebase
+ * stabilizes, especially after the indexing code is written.
+ *
+ *********************************************/
+
+public class PdfParser implements Parser {
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.parse.pdf");
+
+  // redirect org.apache.log4j.Logger to java's native logger, in order
+  // to, at least, suppress annoying log4j warnings. Runs once, at class
+  // initialization: one appender per instance would repeat every message.
+  // Note on 20040614 by Xing:
+  // log4j is used by pdfbox. This snippet'd better be moved
+  // to a common place shared by all parsers that use log4j.
+  static {
+    org.apache.log4j.Logger rootLogger =
+      org.apache.log4j.Logger.getRootLogger();
+    rootLogger.setLevel(org.apache.log4j.Level.INFO);
+    org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
+      new org.apache.log4j.SimpleLayout(),
+      net.nutch.util.LogFormatter.getLogStream(
+        LOG, java.util.logging.Level.INFO));
+    rootLogger.addAppender(appender);
+  }
+
+  public PdfParser () {}
+
+  /**
+   * Extracts text and title from a pdf document held in memory.
+   * @param content fetched document; a non-null content type must start
+   *   with application/pdf
+   * @return the text and title ("" when missing), no outlinks, and a copy
+   *   of the content's meta data
+   * @throws ParseException on a wrong content type, a byte count differing
+   *   from Content-Length, or any failure to read or decrypt the document
+   */
+  public Parse getParse(Content content) throws ParseException {
+    // check that contentType is one we can handle
+    String contentType = content.getContentType();
+    if (contentType != null && !contentType.startsWith("application/pdf"))
+      throw new ParseException(
+        "Content-Type not application/pdf: "+contentType);
+
+    // in memory representation of pdf file
+    PDDocument pdf = null;
+    String text = null;
+    String title = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength =
+        (String)content.getMetaData().get("Content-Length");
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength.trim())) {
+        throw new ParseException("Content truncated at "+raw.length
+          +" bytes. Parser can't handle incomplete pdf file.");
+      }
+
+      PDFParser parser = new PDFParser(
+        new ByteArrayInputStream(raw));
+      parser.parse();
+      pdf = parser.getPDDocument();
+
+      if (pdf.isEncrypted()) {
+        DecryptDocument decryptor = new DecryptDocument(pdf);
+        //Just try using the default password and move on
+        decryptor.decryptDocument("");
+      }
+
+      // collect text
+      PDFTextStripper stripper = new PDFTextStripper();
+      text = stripper.getText(pdf);
+
+      // collect title
+      PDDocumentInformation info = pdf.getDocumentInformation();
+      title = info.getTitle();
+      // more useful info, currently not used. please keep them for future use.
+      // pdf.getPageCount();
+      // info.getAuthor()
+      // info.getSubject()
+      // info.getKeywords()
+      // info.getCreator()
+      // info.getProducer()
+      // info.getTrapped()
+      // formatDate(info.getCreationDate())
+      // formatDate(info.getModificationDate())
+    } catch (ParseException e) {
+      throw e;
+    } catch (CryptographyException e) {
+      throw new ParseException("Error decrypting document. "+e);
+    } catch (InvalidPasswordException e) {
+      throw new ParseException("Can't decrypt document. "+e);
+    } catch (Exception e) { // run time exception
+      throw new ParseException("Can't be handled as pdf document. "+e);
+    } finally {
+      try {
+        if (pdf != null)
+          pdf.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+
+    if (text == null) text = "";
+    if (title == null) title = "";
+
+    // collect outlink
+    Outlink[] outlinks = new Outlink[0];
+
+    // collect meta data
+    Properties metaData = new Properties();
+    metaData.putAll(content.getMetaData()); // copy through
+
+    ParseData parseData = new ParseData(title, outlinks, metaData);
+    return new ParseImpl(text, parseData);
+    // any filter?
+    //return HtmlParseFilters.filter(content, parse, root);
+  }
+
+  // format date
+  // currently not used. please keep it for future use.
+  private String formatDate(Calendar date) {
+    String retval = null;
+    if(date != null) {
+      SimpleDateFormat formatter = new SimpleDateFormat();
+      retval = formatter.format(date.getTime());
+    }
+    return retval;
+  }
+
+}
__________________________________________
http://www.neasys.com - A Good Place to Be
Come to visit us today!
-------------------------------------------------------
This SF.Net email is sponsored by The 2004 JavaOne(SM) Conference
Learn from the experts at JavaOne(SM), Sun's Worldwide Java Developer
Conference, June 28 - July 1 at the Moscone Center in San Francisco, CA
REGISTER AND SAVE! http://java.sun.com/javaone/sf Priority Code NWMGYKND
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers