Author: jukka
Date: Sun Mar 9 03:27:18 2008
New Revision: 635208

URL: http://svn.apache.org/viewvc?rev=635208&view=rev
Log:
TIKA-123: Structured MS Office parsing
- Moved property file parsing to a separate Parser class

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635208&r1=635207&r2=635208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 03:27:18 2008
@@ -16,14 +16,11 @@
  */
 package org.apache.tika.parser.microsoft;
 
-// JDK imports
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.HPSFException;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
@@ -31,6 +28,7 @@
 import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * Defines a Microsoft document content extractor.
@@ -70,106 +68,17 @@ throws IOException, SAXException, TikaException; private void getMetadata( - POIFSFileSystem filesystem, String name, Metadata metadata) { + POIFSFileSystem filesystem, String name, Metadata metadata) + throws IOException, SAXException, TikaException { try { InputStream stream = filesystem.createDocumentInputStream(name); try { - getMetadata(stream, metadata); + new PropertyParser().parse(stream, new DefaultHandler(), metadata); } finally { stream.close(); } - } catch (Exception e) { + } catch (FileNotFoundException e) { // summary information not available, ignore - } - } - - private void getMetadata(InputStream stream, Metadata metadata) - throws HPSFException, IOException { - PropertySet set = PropertySetFactory.create(stream); - if (set instanceof SummaryInformation) { - getMetadata((SummaryInformation) set, metadata); - } else if (set instanceof DocumentSummaryInformation) { - getMetadata((DocumentSummaryInformation) set, metadata); - } - } - - private void getMetadata( - SummaryInformation information, Metadata metadata) { - if (information.getTitle() != null) { - metadata.set(Metadata.TITLE, information.getTitle()); - } - if (information.getAuthor() != null) { - metadata.set(Metadata.AUTHOR, information.getAuthor()); - } - if (information.getKeywords() != null) { - metadata.set(Metadata.KEYWORDS, information.getKeywords()); - } - if (information.getSubject() != null) { - metadata.set(Metadata.SUBJECT, information.getSubject()); - } - if (information.getLastAuthor() != null) { - metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor()); - } - if (information.getComments() != null) { - metadata.set(Metadata.COMMENTS, information.getComments()); - } - if (information.getTemplate() != null) { - metadata.set(Metadata.TEMPLATE, information.getTemplate()); - } - if (information.getApplicationName() != null) { - metadata.set( - Metadata.APPLICATION_NAME, - information.getApplicationName()); - } - if (information.getRevNumber() != null) { - 
metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber()); - } - if (information.getCreateDateTime() != null) { - metadata.set( - "creationdate", - information.getCreateDateTime().toString()); - } - if (information.getCharCount() > 0) { - metadata.set( - Metadata.CHARACTER_COUNT, - Integer.toString(information.getCharCount())); - } - if (information.getEditTime() > 0) { - metadata.set("edittime", Long.toString(information.getEditTime())); - } - if (information.getLastSaveDateTime() != null) { - metadata.set( - Metadata.LAST_SAVED, - information.getLastSaveDateTime().toString()); - } - if (information.getPageCount() > 0) { - metadata.set( - Metadata.PAGE_COUNT, - Integer.toString(information.getPageCount())); - } - if (information.getSecurity() > 0) { - metadata.set( - "security", Integer.toString(information.getSecurity())); - } - if (information.getWordCount() > 0) { - metadata.set( - Metadata.WORD_COUNT, - Integer.toString(information.getWordCount())); - } - if (information.getLastPrinted() != null) { - metadata.set( - Metadata.LAST_PRINTED, - information.getLastPrinted().toString()); - } - } - - private void getMetadata( - DocumentSummaryInformation information, Metadata metadata) { - if (information.getCompany() != null) { - metadata.set("company", information.getCompany()); - } - if (information.getManager() != null) { - metadata.set("manager", information.getManager()); } } Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java?rev=635208&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java Sun Mar 9 03:27:18 2008 @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.NoPropertySetStreamException; +import org.apache.poi.hpsf.PropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.UnexpectedPropertySetTypeException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for HPSF property streams within Microsoft Office files. 
+ */ +public class PropertyParser implements Parser { + + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { + try { + PropertySet properties = + new PropertySet(IOUtils.toByteArray(stream)); + if (properties.isSummaryInformation()) { + SummaryInformation information = new SummaryInformation(properties); + set(metadata, Metadata.TITLE, information.getTitle()); + set(metadata, Metadata.AUTHOR, information.getAuthor()); + set(metadata, Metadata.KEYWORDS, information.getKeywords()); + set(metadata, Metadata.SUBJECT, information.getSubject()); + set(metadata, Metadata.LAST_AUTHOR, information.getLastAuthor()); + set(metadata, Metadata.COMMENTS, information.getComments()); + set(metadata, Metadata.TEMPLATE, information.getTemplate()); + set(metadata, Metadata.APPLICATION_NAME, information.getApplicationName()); + set(metadata, Metadata.REVISION_NUMBER, information.getRevNumber()); + set(metadata, "creationdate", information.getCreateDateTime()); + set(metadata, Metadata.CHARACTER_COUNT, information.getCharCount()); + set(metadata, "edittime", information.getEditTime()); + set(metadata, Metadata.LAST_SAVED, information.getLastSaveDateTime()); + set(metadata, Metadata.PAGE_COUNT, information.getPageCount()); + set(metadata, "security", information.getSecurity()); + set(metadata, Metadata.WORD_COUNT, information.getWordCount()); + set(metadata, Metadata.LAST_PRINTED, information.getLastPrinted()); + } + if (properties.isDocumentSummaryInformation()) { + DocumentSummaryInformation information = new DocumentSummaryInformation(properties); + set(metadata, "company", information.getCompany()); + set(metadata, "manager", information.getManager()); + } + + // No content, just metadata + XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.endDocument(); + } catch (NoPropertySetStreamException e) { + throw new TikaException("Not a HPSF 
document", e); + } catch (UnexpectedPropertySetTypeException e) { + throw new TikaException("Unexpected HPSF document", e); + } + } + + private static void set(Metadata metadata, String name, String value) { + if (value != null) { + metadata.set(name, value); + } + } + + private static void set(Metadata metadata, String name, Date value) { + if (value != null) { + metadata.set(name, value.toString()); + } + } + + private static void set(Metadata metadata, String name, long value) { + if (value > 0) { + metadata.set(name, Long.toString(value)); + } + } + +}