Author: jukka
Date: Sun Mar  9 03:27:18 2008
New Revision: 635208

URL: http://svn.apache.org/viewvc?rev=635208&view=rev
Log:
TIKA-123: Structured MS Office parsing
    - Moved property file parsing to a separate Parser class

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635208&r1=635207&r2=635208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar  9 03:27:18 2008
@@ -16,14 +16,11 @@
  */
 package org.apache.tika.parser.microsoft;
 
-// JDK imports
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.HPSFException;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
@@ -31,6 +28,7 @@
 import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * Defines a Microsoft document content extractor.
@@ -70,106 +68,17 @@
             throws IOException, SAXException, TikaException;
 
     private void getMetadata(
-            POIFSFileSystem filesystem, String name, Metadata metadata) {
+            POIFSFileSystem filesystem, String name, Metadata metadata)
+            throws IOException, SAXException, TikaException {
         try {
             InputStream stream = filesystem.createDocumentInputStream(name);
             try {
-                getMetadata(stream, metadata);
+                new PropertyParser().parse(stream, new DefaultHandler(), metadata);
             } finally {
                 stream.close();
             }
-        } catch (Exception e) {
+        } catch (FileNotFoundException e) {
             // summary information not available, ignore
-        }
-    }
-
-    private void getMetadata(InputStream stream, Metadata metadata)
-            throws HPSFException, IOException {
-        PropertySet set = PropertySetFactory.create(stream);
-        if (set instanceof SummaryInformation) {
-            getMetadata((SummaryInformation) set, metadata);
-        } else if (set instanceof DocumentSummaryInformation) {
-            getMetadata((DocumentSummaryInformation) set, metadata);
-        }
-    }
-
-    private void getMetadata(
-            SummaryInformation information, Metadata metadata) {
-        if (information.getTitle() != null) {
-            metadata.set(Metadata.TITLE, information.getTitle());
-        }
-        if (information.getAuthor() != null) {
-            metadata.set(Metadata.AUTHOR, information.getAuthor());
-        }
-        if (information.getKeywords() != null) {
-            metadata.set(Metadata.KEYWORDS, information.getKeywords());
-        }
-        if (information.getSubject() != null) {
-            metadata.set(Metadata.SUBJECT, information.getSubject());
-        }
-        if (information.getLastAuthor() != null) {
-            metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
-        }
-        if (information.getComments() != null) {
-            metadata.set(Metadata.COMMENTS, information.getComments());
-        }
-        if (information.getTemplate() != null) {
-            metadata.set(Metadata.TEMPLATE, information.getTemplate());
-        }
-        if (information.getApplicationName() != null) {
-            metadata.set(
-                    Metadata.APPLICATION_NAME,
-                    information.getApplicationName());
-        }
-        if (information.getRevNumber() != null) {
-            metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
-        }
-        if (information.getCreateDateTime() != null) {
-            metadata.set(
-                    "creationdate",
-                    information.getCreateDateTime().toString());
-        }
-        if (information.getCharCount() > 0) {
-            metadata.set(
-                    Metadata.CHARACTER_COUNT,
-                    Integer.toString(information.getCharCount()));
-        }
-        if (information.getEditTime() > 0) {
-            metadata.set("edittime", Long.toString(information.getEditTime()));
-        }
-        if (information.getLastSaveDateTime() != null) {
-            metadata.set(
-                    Metadata.LAST_SAVED,
-                    information.getLastSaveDateTime().toString());
-        }
-        if (information.getPageCount() > 0) {
-            metadata.set(
-                    Metadata.PAGE_COUNT,
-                    Integer.toString(information.getPageCount()));
-        }
-        if (information.getSecurity() > 0) {
-            metadata.set(
-                    "security", Integer.toString(information.getSecurity()));
-        }
-        if (information.getWordCount() > 0) {
-            metadata.set(
-                    Metadata.WORD_COUNT,
-                    Integer.toString(information.getWordCount()));
-        }
-        if (information.getLastPrinted() != null) {
-            metadata.set(
-                    Metadata.LAST_PRINTED,
-                    information.getLastPrinted().toString());
-        }
-    }
-
-    private void getMetadata(
-            DocumentSummaryInformation information, Metadata metadata) {
-        if (information.getCompany() != null) {
-            metadata.set("company", information.getCompany());
-        }
-        if (information.getManager() != null) {
-            metadata.set("manager", information.getManager());
         }
     }
 

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java?rev=635208&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java Sun Mar  9 03:27:18 2008
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for HPSF property streams within Microsoft Office files.
+ */
+public class PropertyParser implements Parser {
+
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        try {
+            PropertySet properties =
+                new PropertySet(IOUtils.toByteArray(stream));
+            if (properties.isSummaryInformation()) {
+                SummaryInformation information = new SummaryInformation(properties);
+                set(metadata, Metadata.TITLE, information.getTitle());
+                set(metadata, Metadata.AUTHOR, information.getAuthor());
+                set(metadata, Metadata.KEYWORDS, information.getKeywords());
+                set(metadata, Metadata.SUBJECT, information.getSubject());
+                set(metadata, Metadata.LAST_AUTHOR, information.getLastAuthor());
+                set(metadata, Metadata.COMMENTS, information.getComments());
+                set(metadata, Metadata.TEMPLATE, information.getTemplate());
+                set(metadata, Metadata.APPLICATION_NAME, information.getApplicationName());
+                set(metadata, Metadata.REVISION_NUMBER, information.getRevNumber());
+                set(metadata, "creationdate", information.getCreateDateTime());
+                set(metadata, Metadata.CHARACTER_COUNT, information.getCharCount());
+                set(metadata, "edittime", information.getEditTime());
+                set(metadata, Metadata.LAST_SAVED, information.getLastSaveDateTime());
+                set(metadata, Metadata.PAGE_COUNT, information.getPageCount());
+                set(metadata, "security", information.getSecurity());
+                set(metadata, Metadata.WORD_COUNT, information.getWordCount());
+                set(metadata, Metadata.LAST_PRINTED, information.getLastPrinted());
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                DocumentSummaryInformation information = new DocumentSummaryInformation(properties);
+                set(metadata, "company", information.getCompany());
+                set(metadata, "manager", information.getManager());
+            }
+
+            // No content, just metadata
+            XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.endDocument();
+        } catch (NoPropertySetStreamException e) {
+            throw new TikaException("Not a HPSF document", e);
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        }
+    }
+
+    private static void set(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    private static void set(Metadata metadata, String name, Date value) {
+        if (value != null) {
+            metadata.set(name, value.toString());
+        }
+    }
+
+    private static void set(Metadata metadata, String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+
+}


Reply via email to