[Nutch-dev] patch for pdf & msword parsing

john Mon, 14 Jun 2004 23:46:03 -0700

Hi, Doug,

Here is a patch for pdf & msword parsing.
Necessary jars are available at
http://nutch.neasys.com/patch/
Check for tag 20040614.
Also included is a simple ParserChecker.java,
handy for parser testing.
You will have to apply my previous patches first.


Cheers,

John

-------------------------- patch.txt.20040614 -----------------------------
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/java/net/nutch/parse/ParserChecker.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/java/net/nutch/parse/ParserChecker.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/java/net/nutch/parse/ParserChecker.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/java/net/nutch/parse/ParserChecker.java	2004-06-14 23:02:10.000000000 -0700
@@ -0,0 +1,94 @@
+package net.nutch.parse;
+
+import net.nutch.util.LogFormatter;
+
+import net.nutch.protocol.ProtocolFactory;
+import net.nutch.protocol.Protocol;
+import net.nutch.protocol.Content;
+
+import java.util.logging.Logger;
+
+/*********************************************
+ * Parser checker, useful for testing parser.
+ * 
+ * @author John Xing
+ *********************************************/
+
+public class ParserChecker {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.parse.ParserChecker");
+
+  public ParserChecker() {}
+
+  /**
+   * Fetches the given url, parses the content and prints the resulting
+   * ParseData, followed by the parse text when -dumpText is given.
+   * With -forceAs the given mime type is used instead of the content
+   * type reported by the fetched content.
+   *
+   * @param args [-dumpText] [-forceAs mimeType] url
+   * @throws Exception whatever the protocol or parser calls throw
+   */
+  public static void main(String[] args) throws Exception {
+    boolean dumpText = false;
+    boolean force = false;
+    String contentType = null;
+    String url = null;
+
+    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-forceAs")) {
+        // -forceAs needs a value; without this check args[++i] runs off
+        // the end of the array when the option is the last argument
+        if (i + 1 >= args.length) {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+        force = true;
+        contentType = args[++i];
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (i != args.length-1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        url = args[i];
+      }
+    }
+
+    // no url: empty command line, or the last argument was an option
+    if (url == null) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    LOG.info("fetching: "+url);
+
+    Protocol protocol = ProtocolFactory.getProtocol(url);
+    Content content = protocol.getContent(url);
+
+    if (!force)
+      contentType = content.getContentType();
+
+    if (contentType == null) {
+      System.err.println("Can't determine content type of "+url);
+      System.exit(-1);
+    }
+
+    LOG.info("parsing: "+url);
+
+    Parser parser = ParserFactory.getParser(contentType, url);
+    Parse parse = parser.getParse(content);
+
+    System.out.print("---------\nParseData\n---------\n");
+    System.out.print(parse.getData().toString());
+    if (dumpText) {
+      System.out.print("---------\nParseText\n---------\n");
+      System.out.print(parse.getText());
+    }
+
+    System.exit(0);
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/build.xml nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/build.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/build.xml	2004-06-14 16:47:26.000000000 -0700
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/build.xml	2004-06-14 15:45:57.000000000 -0700
@@ -11,6 +11,8 @@
     <ant dir="protocol-http" target="deploy"/>
     <ant dir="parse-html" target="deploy"/>
     <ant dir="parse-text" target="deploy"/>
+    <ant dir="parse-pdf" target="deploy"/>
+    <ant dir="parse-msword" target="deploy"/>
     <ant dir="creativecommons" target="deploy"/>
   </target>
 
@@ -31,6 +33,8 @@
     <ant dir="protocol-http" target="clean"/>
     <ant dir="parse-html" target="clean"/>
     <ant dir="parse-text" target="clean"/>
+    <ant dir="parse-pdf" target="clean"/>
+    <ant dir="parse-msword" target="clean"/>
   </target>
 
 </project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/build.xml nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/build.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/build.xml	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/build.xml	2004-06-14 15:23:33.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-msword" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/plugin.xml nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/plugin.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/plugin.xml	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/plugin.xml	2004-06-14 15:26:15.000000000 -0700
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-msword"
+   name="MSWord Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="net.nutch.parse.Parser"
+      name="Nutch Content Parser"/>
+
+   <!--
+   <extension-point
+      id="net.nutch.parse.MSWordParseFilter"
+      name="MSWord Parse Filter"/>
+   -->
+
+   <runtime>
+      <library name="parse-msword.jar">
+         <export name="*"/>
+      </library>
+      <library name="poi-2.1-20040508.jar"/>
+      <library name="poi-scratchpad-2.1-20040508.jar"/>
+   </runtime>
+
+   <extension id="net.nutch.parse.msword"
+              name="MSWordParse"
+              point="net.nutch.parse.Parser">
+
+      <implementation id="net.nutch.parse.msword.MSWordParser"
+                      class="net.nutch.parse.msword.MSWordParser"
+                      contentType="application/msword"
+                      pathSuffix=""/>
+
+   </extension>
+
+</plugin>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/chp/Word6CHPBinTable.java	2004-06-14 22:04:05.000000000 -0700
@@ -0,0 +1,77 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package net.nutch.parse.msword.chp;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.OutputStream;
+import java.io.IOException;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.io.*;
+import org.apache.poi.hwpf.model.*;
+
+/**
+ * This class holds all of the character formatting properties from a Word
+ * 6.0/95 document.
+ *
+ * @author Ryan Ackley
+ */
+public class Word6CHPBinTable
+{
+  /** List of character properties.*/
+  ArrayList _textRuns = new ArrayList();
+
+  /**
+   * Constructor used to read a binTable in from a Word document.
+   *
+   * @param documentStream The POIFS "WordDocument" stream from a Word document
+   * @param offset The offset of the Chp bin table in the main stream.
+   * @param size The size of the Chp bin table in the main stream.
+   * @param fcMin The start of text in the main stream.
+   */
+  public Word6CHPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin)
+  {
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      int pageNum = LittleEndian.getShort((byte[])node.getBytes());
+      int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin);
+
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  public List getTextRuns()
+  {
+    return _textRuns;
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/FastSavedException.java	2004-06-14 15:43:36.000000000 -0700
@@ -0,0 +1,34 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+/**
+ * <p>Title: FastSavedException</p>
+ * <p>Description: Signals a fast-saved Word document, which is not supported.</p>
+ * <p>Copyright: Copyright (c) 2003</p>
+ * <p>Company: </p>
+ * @author not attributable
+ * @version 1.0
+ */
+
+public class FastSavedException extends Exception
+{
+  public FastSavedException(String msg)
+  {
+    super(msg);
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/MSWordParser.java	2004-06-14 21:55:24.000000000 -0700
@@ -0,0 +1,103 @@
+/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.parse.msword;
+
+import net.nutch.protocol.Content;
+import net.nutch.util.LogFormatter;
+import net.nutch.parse.Parser;
+import net.nutch.parse.Parse;
+import net.nutch.parse.ParseData;
+import net.nutch.parse.ParseImpl;
+import net.nutch.parse.Outlink;
+import net.nutch.parse.ParseException;
+
+import java.util.Properties;
+//import java.util.logging.Logger;
+
+import java.io.ByteArrayInputStream;
+
+/*********************************************
+ * parser for mime type application/msword.
+ * It is based on org.apache.poi.*. We have to see how well it performs.
+ * 
+ * @author John Xing
+ *
+ * Note on 20040614 by Xing:
+ * Some code is stacked here for convenience (see inline comments).
+ * It may be moved to more appropriate places when the new codebase
+ * stabilizes, especially after code for indexing is written.
+ *
+ *********************************************/
+
+public class MSWordParser implements Parser {
+//  public static final Logger LOG =
+//    LogFormatter.getLogger("net.nutch.parse.msword");
+
+  public MSWordParser () {}
+
+  public Parse getParse(Content content) throws ParseException {
+
+    // check that contentType is one we can handle
+    String contentType = content.getContentType();
+    if (contentType != null && !contentType.startsWith("application/msword"))
+      throw new ParseException(
+        "Content-Type not application/msword: "+contentType);
+
+    String text = null;
+    String title = null;
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      String contentLength =
+        (String)content.getMetaData().get("Content-Length");
+      if (contentLength != null
+            && raw.length != Integer.parseInt(contentLength)) {
+          throw new ParseException("Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete msword file.");
+      }
+
+      WordExtractor extractor = new WordExtractor();
+
+      // collect text
+      text = extractor.extractText(new ByteArrayInputStream(raw));
+
+      // collect meta info
+      // not yet
+
+      extractor = null;
+
+    } catch (ParseException e) {
+      throw e;
+    } catch (FastSavedException e) {
+      throw new ParseException(e);
+    } catch (PasswordProtectedException e) {
+      throw new ParseException(e);
+    } catch (Exception e) { // run time exception
+      throw new ParseException("Can't be handled as msword document. "+e);
+    } finally {
+      // nothing so far
+    }
+
+    if (text == null)
+      text = "";
+
+    if (title == null)
+      title = "";
+
+    // collect outlink
+    Outlink[] outlinks = new Outlink[0];
+
+    // collect meta data
+    Properties metaData = new Properties();
+    metaData.putAll(content.getMetaData()); // copy through
+
+    ParseData parseData = new ParseData(title, outlinks, metaData);
+    return new ParseImpl(text, parseData);
+    // any filter?
+    //return HtmlParseFilters.filter(content, parse, root);
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/PasswordProtectedException.java	2004-06-14 15:47:11.000000000 -0700
@@ -0,0 +1,25 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+public class PasswordProtectedException
+  extends Exception
+{
+  public PasswordProtectedException(String msg)
+  {
+    super(msg);
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Test.java	2004-06-14 15:44:05.000000000 -0700
@@ -0,0 +1,52 @@
+package net.nutch.parse.msword;
+
+import java.io.*;
+/**
+ * Command-line check for WordExtractor: extracts the text of the Word file
+ * named by the first argument, prints it and also writes it to ./test.txt.
+ *
+ * Copyright:    Copyright (c) 2003
+ * @version 1.0
+ */
+
+class Test
+{
+
+  public Test()
+  {
+  }
+  /**
+   * @param args args[0] is the path of the Word file to read
+   */
+  public static void main(String[] args)
+  {
+    if (args.length < 1)
+    {
+      System.err.println("Usage: Test wordFile");
+      return;
+    }
+    InputStream in = null;
+    OutputStreamWriter out = null;
+    try
+    {
+      WordExtractor extractor = new WordExtractor();
+      in = new FileInputStream(args[0]);
+      String s = extractor.extractText(in);
+      System.out.println(s);
+      //OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream("C:\\test.txt"), "UTF-16LE");
+      out = new OutputStreamWriter(new FileOutputStream("./test.txt"));
+      out.write(s);
+      out.flush();
+    }
+    catch (Exception e)
+    {
+      e.printStackTrace();
+    }
+    finally
+    {
+      // close both streams even when extraction or writing failed
+      try { if (out != null) out.close(); } catch (IOException e) { e.printStackTrace(); }
+      try { if (in != null) in.close(); } catch (IOException e) { e.printStackTrace(); }
+    }
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/Word6Extractor.java	2004-06-14 22:07:53.000000000 -0700
@@ -0,0 +1,231 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+import net.nutch.parse.msword.chp.*;
+
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.sprm.*;
+
+import java.util.*;
+import java.io.*;
+
+/**
+ * This class is used to extract text from Word 6 documents only. It should
+ * only be called from net.nutch.parse.msword.WordExtractor because
+ * it will automatically determine the version.
+ *
+ * @author Ryan Ackley
+ */
+class Word6Extractor
+{
+
+  public Word6Extractor()
+  {
+  }
+
+  /**
+   * Extracts the text of all character runs that are not marked as deleted.
+   *
+   * @param mainStream The POIFS document stream entitled "WordDocument".
+   *
+   * @return The text from the document
+   * @throws Exception If there are any unexpected exceptions.
+   */
+  public String extractText(byte[] mainStream) throws Exception
+  {
+    int fcMin = LittleEndian.getInt(mainStream, 0x18);
+    int fcMax = LittleEndian.getInt(mainStream, 0x1C);
+
+    int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
+    int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
+
+    // get a list of character properties
+    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
+      chpTableSize, fcMin);
+    List textRuns = chpTable.getTextRuns();
+
+    // iterate through the character runs, keeping text that is not deleted
+    WordTextBuffer finalTextBuf = new WordTextBuffer();
+    Iterator runsIt = textRuns.iterator();
+    while(runsIt.hasNext())
+    {
+      CHPX chpx = (CHPX)runsIt.next();
+      int runStart = chpx.getStart() + fcMin;
+      int runEnd = chpx.getEnd() + fcMin;
+      if (!isDeleted(chpx.getGrpprl()))
+      {
+        String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
+        finalTextBuf.append(s);
+      }
+      // stop at fcMax even for a deleted run, so no later run starts past it
+      if (runEnd >= fcMax)
+      {
+        break;
+      }
+    }
+
+    return finalTextBuf.toString();
+  }
+
+  /**
+   * Used to determine if a run of text has been deleted.
+   * @param grpprl The list of sprms for this run of text.
+   * @return true if this run of text has been deleted.
+   */
+  private boolean isDeleted(byte[] grpprl)
+  {
+    int offset = 0;
+    boolean deleted = false;
+    while (offset < grpprl.length)
+    {
+      switch (LittleEndian.getUnsignedByte(grpprl, offset++))
+      {
+        case 65:
+          deleted = grpprl[offset++] != 0;
+          break;
+        case 66:
+          offset++;
+          break;
+        case 67:
+          offset++;
+          break;
+        case 68:
+          offset += grpprl[offset];
+          break;
+        case 69:
+          offset += 2;
+          break;
+        case 70:
+          offset += 4;
+          break;
+        case 71:
+          offset++;
+          break;
+        case 72:
+          offset += 2;
+          break;
+        case 73:
+          offset += 3;
+          break;
+        case 74:
+          offset += grpprl[offset];
+          break;
+        case 75:
+          offset++;
+          break;
+        case 80:
+          offset += 2;
+          break;
+        case 81:
+          offset += grpprl[offset];
+          break;
+        case 82:
+          offset += grpprl[offset];
+          break;
+        case 83:
+          break;
+        case 85:
+          offset++;
+          break;
+        case 86:
+          offset++;
+          break;
+        case 87:
+          offset++;
+          break;
+        case 88:
+          offset++;
+          break;
+        case 89:
+          offset++;
+          break;
+        case 90:
+          offset++;
+          break;
+        case 91:
+          offset++;
+          break;
+        case 92:
+          offset++;
+          break;
+        case 93:
+          offset += 2;
+          break;
+        case 94:
+          offset++;
+          break;
+        case 95:
+          offset += 3;
+          break;
+        case 96:
+          offset += 2;
+          break;
+        case 97:
+          offset += 2;
+          break;
+        case 98:
+          offset++;
+          break;
+        case 99:
+          offset++;
+          break;
+        case 100:
+          offset++;
+          break;
+        case 101:
+          offset++;
+          break;
+        case 102:
+          offset++;
+          break;
+        case 103:
+          offset += grpprl[offset];
+          break;
+        case 104:
+          offset++;
+          break;
+        case 105:
+          offset += grpprl[offset];
+          break;
+        case 106:
+          offset += grpprl[offset];
+          break;
+        case 107:
+          offset += 2;
+          break;
+        case 108:
+          offset += grpprl[offset];
+          break;
+        case 109:
+          offset += 2;
+          break;
+        case 110:
+          offset += 2;
+          break;
+        case 117:
+          offset++;
+          break;
+        case 118:
+          offset++;
+          break;
+
+      }
+    }
+    return deleted;
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordExtractor.java	2004-06-14 22:08:23.000000000 -0700
@@ -0,0 +1,217 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.sprm.*;
+
+import java.util.*;
+import java.io.*;
+
+/**
+ * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
+ *
+ * @author Ryan Ackley
+ */
+public class WordExtractor
+{
+
+  /**
+   * Constructor
+   */
+  public WordExtractor()
+  {
+  }
+
+  /**
+   * Gets the text from a Word document.
+   *
+   * @param in The InputStream representing the Word file.
+   */
+  public String extractText(InputStream in) throws Exception
+  {
+    ArrayList text = new ArrayList();
+    POIFSFileSystem fsys = new POIFSFileSystem(in);
+
+    // load our POIFS document streams.
+    DocumentEntry headerProps =
+        (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
+    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
+    byte[] header = new byte[headerProps.getSize()];
+
+
+    din.read(header);
+    din.close();
+
+    int info = LittleEndian.getShort(header, 0xa);
+    if ((info & 0x4) != 0)
+    {
+      throw new FastSavedException("Fast-saved files are unsupported at this time");
+    }
+    if ((info & 0x100) != 0)
+    {
+      throw new PasswordProtectedException("This document is password protected");
+    }
+
+    // determine the version of Word this document came from.
+    int nFib = LittleEndian.getShort(header, 0x2);
+    switch (nFib)
+    {
+      case 101:
+      case 102:
+      case 103:
+      case 104:
+        // this is a Word 6.0 doc send it to the extractor for that version.
+        Word6Extractor oldExtractor = new Word6Extractor();
+        return oldExtractor.extractText(header);
+    }
+
+    //Get the information we need from the header
+    boolean useTable1 = (info & 0x200) != 0;
+
+    //get the location of the piece table
+    int complexOffset = LittleEndian.getInt(header, 0x1a2);
+
+    // determine which table stream we must use.
+    String tableName = null;
+    if (useTable1)
+    {
+      tableName = "1Table";
+    }
+    else
+    {
+      tableName = "0Table";
+    }
+
+    DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
+    byte[] tableStream = new byte[table.getSize()];
+
+    din = fsys.createDocumentInputStream(tableName);
+
+    din.read(tableStream);
+    din.close();
+
+    int chpOffset = LittleEndian.getInt(header, 0xfa);
+    int chpSize = LittleEndian.getInt(header, 0xfe);
+    int fcMin = LittleEndian.getInt(header, 0x18);
+    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
+
+    // load our text pieces and our character runs
+    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
+    TextPieceTable tpt = cft.getTextPieceTable();
+    List textPieces = tpt.getTextPieces();
+
+    // make the POIFS objects available for garbage collection
+    din = null;
+    fsys = null;
+    table = null;
+    headerProps = null;
+
+    List textRuns = cbt.getTextRuns();
+    Iterator runIt = textRuns.iterator();
+    Iterator textIt = textPieces.iterator();
+
+    TextPiece currentPiece = (TextPiece)textIt.next();
+    int currentTextStart = currentPiece.getStart();
+    int currentTextEnd = currentPiece.getEnd();
+
+    WordTextBuffer finalTextBuf = new WordTextBuffer();
+
+    // iterate through all text runs extract the text only if they haven't been
+    // deleted
+    while (runIt.hasNext())
+    {
+      CHPX chpx = (CHPX)runIt.next();
+      boolean deleted = isDeleted(chpx.getGrpprl());
+      if (deleted)
+      {
+        continue;
+      }
+
+      int runStart = chpx.getStart();
+      int runEnd = chpx.getEnd();
+
+      while (runStart >= currentTextEnd)
+      {
+        currentPiece = (TextPiece) textIt.next ();
+        currentTextStart = currentPiece.getStart ();
+        currentTextEnd = currentPiece.getEnd ();
+      }
+
+      if (runEnd < currentTextEnd)
+      {
+        String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+        finalTextBuf.append(str);
+      }
+      else if (runEnd > currentTextEnd)
+      {
+        while (runEnd > currentTextEnd)
+        {
+          String str = currentPiece.substring(runStart - currentTextStart,
+                                   currentTextEnd - currentTextStart);
+          finalTextBuf.append(str);
+          if (textIt.hasNext())
+          {
+            currentPiece = (TextPiece) textIt.next ();
+            currentTextStart = currentPiece.getStart ();
+            runStart = currentTextStart;
+            currentTextEnd = currentPiece.getEnd ();
+          }
+          else
+          {
+            return finalTextBuf.toString();
+          }
+        }
+        String str = currentPiece.substring(0, runEnd - currentTextStart);
+        finalTextBuf.append(str);
+      }
+      else
+      {
+        String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+        if (textIt.hasNext())
+        {
+          currentPiece = (TextPiece) textIt.next();
+          currentTextStart = currentPiece.getStart();
+          currentTextEnd = currentPiece.getEnd();
+        }
+        finalTextBuf.append(str);
+      }
+    }
+    return finalTextBuf.toString();
+  }
+
+  /**
+   * Used to determine if a run of text has been deleted.
+   *
+   * @param grpprl The list of sprms for a particular run of text.
+   * @return true if this run of text has been deleted.
+   */
+  private boolean isDeleted(byte[] grpprl)
+  {
+    SprmIterator iterator = new SprmIterator(grpprl,0);
+    while (iterator.hasNext())
+    {
+      SprmOperation op = iterator.next();
+      // 0 is the operation that signals a FDelRMark operation
+      if (op.getOperation() == 0 && op.getOperand() != 0)
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java	1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextBuffer.java	2004-06-14 15:44:59.000000000 -0700
@@ -0,0 +1,66 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package net.nutch.parse.msword;
+
+
+/**
+ * This class acts as a StringBuffer for text from a word document. It allows
+ * processing of characters before they are appended to the buffer.
+ * @author Ryan Ackley
+ * @version 1.0
+ */
+public class WordTextBuffer
+{
+  StringBuffer _buf; // accumulated output text, returned by toString()
+  boolean _hold; // while true, ordinary characters passed to append() are dropped
+
+  public WordTextBuffer()
+  {
+    _buf = new StringBuffer();
+    _hold = false; // start out copying characters through
+  }
+
+  public void append(String text) // filters text one character at a time into _buf
+  {
+    char[] letters = text.toCharArray();
+    for (int x = 0; x < letters.length; x++)
+    {
+      switch(letters[x])
+      {
+        case '\r': // carriage return is expanded to CR LF; note this happens even while _hold is set
+          _buf.append("\r\n");
+          break;
+        case 0x13: // starts suppression (presumably Word's field-begin mark - TODO confirm); the mark itself is not copied
+          _hold = true;
+          break;
+        case 0x14: // ends suppression (presumably Word's field-separator mark - TODO confirm); the mark itself is not copied
+          _hold = false;
+          break;
+        default: // any other character is copied only when not suppressed
+          if (!_hold)
+          {
+            _buf.append(letters[x]);
+          }
+          break;
+      }
+    }
+  }
+
+  public String toString() // returns everything appended so far, after filtering
+  {
+    return _buf.toString();
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
--- 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
        1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-msword/src/java/net/nutch/parse/msword/WordTextPiece.java
   2004-06-14 15:45:15.000000000 -0700
@@ -0,0 +1,54 @@
+/*  Copyright 2004 Ryan Ackley
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package net.nutch.parse.msword;
+
+/**
+ * This class stores info about the data structure describing a chunk of text
+ * in a Word document. Specifically, whether or not a Range of text uses
+ * unicode or Cp1252 encoding.
+ *
+ * @author Ryan Ackley
+ */
+
+class WordTextPiece
+{
+  private int _fcStart; // start value given to the constructor; presumably a file offset - TODO confirm
+  private boolean _usesUnicode; // the constructor's unicode flag, returned by usesUnicode()
+  private int _length; // length value given to the constructor; units (chars vs. bytes) not visible here
+
+  public WordTextPiece(int start, int length, boolean unicode) // stores the three values unchanged; no validation is done
+  {
+    _usesUnicode = unicode;
+    _length = length;
+    _fcStart = start;
+  }
+   public boolean usesUnicode() // returns the unicode flag passed at construction
+  {
+      return _usesUnicode;
+  }
+
+  public int getStart() // returns the start value passed at construction
+  {
+      return _fcStart;
+  }
+  public int getLength() // returns the length value passed at construction
+  {
+    return _length;
+  }
+
+
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/build.xml 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/build.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/build.xml   
 1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/build.xml  
     2004-06-13 22:22:11.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-pdf" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/plugin.xml 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/plugin.xml
--- nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/plugin.xml  
 1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/plugin.xml 
     2004-06-14 11:03:46.000000000 -0700
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-pdf"
+   name="Pdf Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="net.nutch.parse.Parser"
+      name="Nutch Content Parser"/>
+
+   <!--
+   <extension-point
+      id="net.nutch.parse.PdfParseFilter"
+      name="PDF Parse Filter"/>
+   -->
+
+   <runtime>
+      <library name="parse-pdf.jar">
+         <export name="*"/>
+      </library>
+      <library name="PDFBox-0.6.5.jar"/>
+      <library name="log4j.jar"/>
+   </runtime>
+
+   <extension id="net.nutch.parse.pdf"
+              name="PdfParse"
+              point="net.nutch.parse.Parser">
+
+      <implementation id="net.nutch.parse.pdf.PdfParser"
+                      class="net.nutch.parse.pdf.PdfParser"
+                      contentType="application/pdf"
+                      pathSuffix=""/>
+
+   </extension>
+
+</plugin>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
--- 
nutch-cvs-20040605.patched.with.20040605.20040606/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
  1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.patched.with.20040605.20040606.xing/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/PdfParser.java
     2004-06-14 14:52:14.000000000 -0700
@@ -0,0 +1,172 @@
+/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.parse.pdf;
+
+import org.pdfbox.encryption.DecryptDocument;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDDocumentInformation;
+import org.pdfbox.util.PDFTextStripper;
+
+import org.pdfbox.exceptions.CryptographyException;
+import org.pdfbox.exceptions.InvalidPasswordException;
+
+import net.nutch.protocol.Content;
+import net.nutch.util.LogFormatter;
+import net.nutch.parse.Parser;
+import net.nutch.parse.Parse;
+import net.nutch.parse.ParseData;
+import net.nutch.parse.ParseImpl;
+import net.nutch.parse.Outlink;
+import net.nutch.parse.ParseException;
+
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+/*********************************************
+ * parser for mime type application/pdf.
+ * It is based on org.pdfbox.*. We have to see how well it does the job.
+ * 
+ * @author John Xing
+ *
+ * Note on 20040614 by Xing:
+ * Some code is placed here for convenience (see inline comments).
+ * It may be moved to more appropriate places when the new codebase
+ * stabilizes, especially after the code for indexing is written.
+ *
+ *********************************************/
+
+public class PdfParser implements Parser {
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.parse.pdf");
+
+  public PdfParser () {
+    // Redirect org.apache.log4j.Logger output to java's native logger, in
+    // order to, at least, suppress annoying log4j warnings.
+    // Note on 20040614 by Xing:
+    // log4j is used by pdfbox. This snippet should be moved
+    // to a common place shared by all parsers that use log4j.
+    org.apache.log4j.Logger rootLogger =
+      org.apache.log4j.Logger.getRootLogger();
+
+    rootLogger.setLevel(org.apache.log4j.Level.INFO);
+
+    org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
+      new org.apache.log4j.SimpleLayout(),
+      net.nutch.util.LogFormatter.getLogStream(
+        this.LOG, java.util.logging.Level.INFO)); // LOG is static; it is merely referenced through 'this' here
+
+    rootLogger.addAppender(appender); // NOTE(review): runs on every construction, so each new PdfParser adds one more appender to the root logger
+  }
+
+  public Parse getParse(Content content) throws ParseException { // extracts text and title from pdf content; every failure surfaces as ParseException
+
+    // check that contentType is one we can handle (a null contentType is let through)
+    String contentType = content.getContentType();
+    if (contentType != null && !contentType.startsWith("application/pdf"))
+      throw new ParseException(
+        "Content-Type not application/pdf: "+contentType);
+
+    // in memory representation of pdf file; closed in the finally block below
+    PDDocument pdf = null;
+
+    String text = null;
+    String title = null;
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      String contentLength =
+        (String)content.getMetaData().get("Content-Length"); // compared below to the bytes actually held, to detect truncated content
+      if (contentLength != null
+            && raw.length != Integer.parseInt(contentLength)) { // a non-numeric value makes parseInt throw; that lands in the generic catch below
+          throw new ParseException("Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete pdf file.");
+      }
+
+      PDFParser parser = new PDFParser(
+        new ByteArrayInputStream(raw));
+      parser.parse();
+
+      pdf = parser.getPDDocument();
+
+      if (pdf.isEncrypted()) {
+        DecryptDocument decryptor = new DecryptDocument(pdf);
+        // Just try the empty string as the password and move on
+        decryptor.decryptDocument("");
+      }
+
+      // collect text
+      PDFTextStripper stripper = new PDFTextStripper();
+      text = stripper.getText(pdf);
+
+      // collect title
+      PDDocumentInformation info = pdf.getDocumentInformation();
+      title = info.getTitle();
+      // more useful info, currently not used. please keep them for future use.
+      // pdf.getPageCount();
+      // info.getAuthor()
+      // info.getSubject()
+      // info.getKeywords()
+      // info.getCreator()
+      // info.getProducer()
+      // info.getTrapped()
+      // formatDate(info.getCreationDate())
+      // formatDate(info.getModificationDate())
+
+    } catch (ParseException e) { // rethrown unchanged so the generic catch below does not re-wrap it
+      throw e;
+    } catch (CryptographyException e) { // the cause is kept only as text appended to the message
+      throw new ParseException("Error decrypting document. "+e);
+    } catch (InvalidPasswordException e) { // the empty password tried above was rejected
+      throw new ParseException("Can't decrypt document. "+e);
+    } catch (Exception e) { // everything else, including runtime exceptions
+      throw new ParseException("Can't be handled as pdf document. "+e);
+    } finally {
+      try {
+        if (pdf != null)
+          pdf.close();
+        } catch (IOException e) {
+          // nothing to do: a failure while closing is deliberately ignored
+        }
+    }
+
+    if (text == null) // never hand a null text to ParseImpl
+      text = "";
+
+    if (title == null) // never hand a null title to ParseData
+      title = "";
+
+    // collect outlinks (none are extracted here; the array is always empty)
+    Outlink[] outlinks = new Outlink[0];
+
+    // collect meta data
+    Properties metaData = new Properties();
+    metaData.putAll(content.getMetaData()); // copy every entry of the content's metadata through
+
+    ParseData parseData = new ParseData(title, outlinks, metaData);
+    return new ParseImpl(text, parseData);
+    // any filter?
+    //return HtmlParseFilters.filter(content, parse, root);
+  }
+
+  // Formats a Calendar with SimpleDateFormat's default pattern; returns null for a null date.
+  // currently not used. please keep it for future use.
+  private String formatDate(Calendar date) {
+    String retval = null;
+    if(date != null) {
+      SimpleDateFormat formatter = new SimpleDateFormat(); // no-arg constructor: default pattern for the default locale
+      retval = formatter.format(date.getTime());
+    }
+    return retval;
+  }
+
+}
__________________________________________
http://www.neasys.com - A Good Place to Be
Come to visit us today!


-------------------------------------------------------
This SF.Net email is sponsored by The 2004 JavaOne(SM) Conference
Learn from the experts at JavaOne(SM), Sun's Worldwide Java Developer
Conference, June 28 - July 1 at the Moscone Center in San Francisco, CA
REGISTER AND SAVE! http://java.sun.com/javaone/sf Priority Code NWMGYKND
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

[Nutch-dev] patch for pdf & msword parsing

Reply via email to