Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri May 29 14:36:21 2015 @@ -45,198 +45,198 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; public class HSLFExtractor extends AbstractPOIFSExtractor { - public HSLFExtractor(ParseContext context) { - super(context); - } - - protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { - parse(filesystem.getRoot(), xhtml); - } - - protected void parse( - DirectoryNode root, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { - HSLFSlideShow ss = new HSLFSlideShow(root); - SlideShow _show = new SlideShow(ss); - Slide[] _slides = _show.getSlides(); + public HSLFExtractor(ParseContext context) { + super(context); + } - xhtml.startElement("div", "class", "slideShow"); + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml); + } - /* Iterate over slides and extract text */ - for( Slide slide : _slides ) { - xhtml.startElement("div", "class", "slide"); + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + HSLFSlideShow ss = new HSLFSlideShow(root); + SlideShow _show = new SlideShow(ss); + Slide[] _slides = _show.getSlides(); - // Slide header, if present - HeadersFooters hf = slide.getHeadersFooters(); - if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { - xhtml.startElement("p", "class", "slide-header"); + xhtml.startElement("div", "class", "slideShow"); - xhtml.characters( hf.getHeaderText() ); + /* Iterate over slides and extract text */ + for (Slide slide : _slides) { + xhtml.startElement("div", "class", "slide"); + + // Slide header, if present + HeadersFooters hf = slide.getHeadersFooters(); + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-header"); - xhtml.endElement("p"); - } + xhtml.characters(hf.getHeaderText()); - // Slide master, if present - extractMaster(xhtml, slide.getMasterSheet()); + xhtml.endElement("p"); + } - // Slide text - { - xhtml.startElement("p", "class", "slide-content"); + // Slide master, if present + extractMaster(xhtml, slide.getMasterSheet()); - textRunsToText(xhtml, slide.getTextRuns()); + // Slide text + { + xhtml.startElement("p", "class", "slide-content"); - xhtml.endElement("p"); - } + textRunsToText(xhtml, slide.getTextRuns()); - // Table text - for (Shape shape: slide.getShapes()){ - if (shape instanceof Table){ - extractTableText(xhtml, (Table)shape); + xhtml.endElement("p"); } - } - // Slide footer, if present - if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { - xhtml.startElement("p", "class", "slide-footer"); + // Table text + for (Shape shape : slide.getShapes()) { + if (shape instanceof Table) { + extractTableText(xhtml, (Table) shape); + } + } - xhtml.characters( hf.getFooterText() ); + // Slide footer, if present + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-footer"); - xhtml.endElement("p"); - } + xhtml.characters(hf.getFooterText()); - // Comments, if present - for( Comment comment : slide.getComments() ) { - xhtml.startElement("p", "class", "slide-comment"); - if (comment.getAuthor() != null) { - xhtml.startElement("b"); - xhtml.characters( comment.getAuthor() ); - xhtml.endElement("b"); - - if (comment.getText() != null) { - xhtml.characters( " - "); - } + xhtml.endElement("p"); } - if (comment.getText() != null) { - xhtml.characters( comment.getText() ); + + // Comments, if present + for (Comment comment : slide.getComments()) { + xhtml.startElement("p", "class", "slide-comment"); + if (comment.getAuthor() != null) { + xhtml.startElement("b"); + xhtml.characters(comment.getAuthor()); + xhtml.endElement("b"); + + if (comment.getText() != null) { + xhtml.characters(" - "); + } + } + if (comment.getText() != null) { + xhtml.characters(comment.getText()); + } + xhtml.endElement("p"); } - xhtml.endElement("p"); - } - // Now any embedded resources - handleSlideEmbeddedResources(slide, xhtml); + // Now any embedded resources + handleSlideEmbeddedResources(slide, xhtml); - // TODO Find the Notes for this slide and extract inline + // TODO Find the Notes for this slide and extract inline - // Slide complete - xhtml.endElement("div"); - } + // Slide complete + xhtml.endElement("div"); + } - // All slides done - xhtml.endElement("div"); + // All slides done + xhtml.endElement("div"); /* notes */ - xhtml.startElement("div", "class", "slideNotes"); - HashSet<Integer> seenNotes = new HashSet<Integer>(); - HeadersFooters hf = _show.getNotesHeadersFooters(); - - for (Slide slide : _slides) { - Notes notes = slide.getNotesSheet(); - if (notes == null) { - continue; - } - Integer id = notes._getSheetNumber(); - if (seenNotes.contains(id)) { - continue; - } - seenNotes.add(id); - - // Repeat the Notes header, if set - if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { - xhtml.startElement("p", "class", "slide-note-header"); - xhtml.characters( hf.getHeaderText() ); - xhtml.endElement("p"); - } - - // Notes text - textRunsToText(xhtml, notes.getTextRuns()); - - // Repeat the notes footer, if set - if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { - xhtml.startElement("p", "class", "slide-note-footer"); - xhtml.characters( hf.getFooterText() ); - xhtml.endElement("p"); - } - } - - handleSlideEmbeddedPictures(_show, xhtml); - - xhtml.endElement("div"); - } - - private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException { - if (master == null){ - return; - } - Shape[] shapes = master.getShapes(); - if (shapes == null || shapes.length == 0){ - return; - } - - xhtml.startElement("div", "class", "slide-master-content"); - for (Shape shape : shapes){ - if (shape != null && ! MasterSheet.isPlaceholder(shape)){ - if (shape instanceof TextShape){ - TextShape tsh = (TextShape)shape; - String text = tsh.getText(); - if (text != null){ - xhtml.element("p", text); - } - } - } - } - xhtml.endElement("div"); - } - - private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { - xhtml.startElement("table"); - for (int row = 0; row < shape.getNumberOfRows(); row++){ - xhtml.startElement("tr"); - for (int col = 0; col < shape.getNumberOfColumns(); col++){ - TableCell cell = shape.getCell(row, col); - //insert empty string for empty cell if cell is null - String txt = ""; - if (cell != null){ - txt = cell.getText(); - } - xhtml.element("td", txt); - } - xhtml.endElement("tr"); - } - xhtml.endElement("table"); - } - - private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException { - if (runs==null) { - return; - } - - for (TextRun run : runs) { - if (run != null) { - // Leaving in wisdom from TIKA-712 for easy revert. - // Avoid boiler-plate text on the master slide (0 - // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): - //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { - String txt = run.getText(); - if (txt != null){ - xhtml.characters(txt); - xhtml.startElement("br"); - xhtml.endElement("br"); - } - } - } - } + xhtml.startElement("div", "class", "slideNotes"); + HashSet<Integer> seenNotes = new HashSet<Integer>(); + HeadersFooters hf = _show.getNotesHeadersFooters(); + + for (Slide slide : _slides) { + Notes notes = slide.getNotesSheet(); + if (notes == null) { + continue; + } + Integer id = notes._getSheetNumber(); + if (seenNotes.contains(id)) { + continue; + } + seenNotes.add(id); + + // Repeat the Notes header, if set + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-note-header"); + xhtml.characters(hf.getHeaderText()); + xhtml.endElement("p"); + } + + // Notes text + textRunsToText(xhtml, notes.getTextRuns()); + + // Repeat the notes footer, if set + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-note-footer"); + xhtml.characters(hf.getFooterText()); + xhtml.endElement("p"); + } + } + + handleSlideEmbeddedPictures(_show, xhtml); + + xhtml.endElement("div"); + } + + private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException { + if (master == null) { + return; + } + Shape[] shapes = master.getShapes(); + if (shapes == null || shapes.length == 0) { + return; + } + + xhtml.startElement("div", "class", "slide-master-content"); + for (Shape shape : shapes) { + if (shape != null && !MasterSheet.isPlaceholder(shape)) { + if (shape instanceof TextShape) { + TextShape tsh = (TextShape) shape; + String text = tsh.getText(); + if (text != null) { + xhtml.element("p", text); + } + } + } + } + xhtml.endElement("div"); + } + + private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { + xhtml.startElement("table"); + for (int row = 0; row < shape.getNumberOfRows(); row++) { + xhtml.startElement("tr"); + for (int col = 0; col < shape.getNumberOfColumns(); col++) { + TableCell cell = shape.getCell(row, col); + //insert empty string for empty cell if cell is null + String txt = ""; + if (cell != null) { + txt = cell.getText(); + } + xhtml.element("td", txt); + } + xhtml.endElement("tr"); + } + xhtml.endElement("table"); + } + + private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException { + if (runs == null) { + return; + } + + for (TextRun run : runs) { + if (run != null) { + // Leaving in wisdom from TIKA-712 for easy revert. + // Avoid boiler-plate text on the master slide (0 + // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): + //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { + String txt = run.getText(); + if (txt != null) { + xhtml.characters(txt); + xhtml.startElement("br"); + xhtml.endElement("br"); + } + } + } + } private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException { @@ -262,60 +262,60 @@ public class HSLFExtractor extends Abstr } handleEmbeddedResource( - TikaInputStream.get(pic.getData()), null, null, - mediaType, xhtml, false); + TikaInputStream.get(pic.getData()), null, null, + mediaType, xhtml, false); } } private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml) - throws TikaException, SAXException, IOException { - Shape[] shapes; - try { - shapes = slide.getShapes(); - } catch(NullPointerException e) { - // Sometimes HSLF hits problems - // Please open POI bugs for any you come across! - return; - } - - for( Shape shape : shapes ) { - if( shape instanceof OLEShape ) { - OLEShape oleShape = (OLEShape)shape; - ObjectData data = null; - try { - data = oleShape.getObjectData(); - } catch( NullPointerException e ) { + throws TikaException, SAXException, IOException { + Shape[] shapes; + try { + shapes = slide.getShapes(); + } catch (NullPointerException e) { + // Sometimes HSLF hits problems + // Please open POI bugs for any you come across! + return; + } + + for (Shape shape : shapes) { + if (shape instanceof OLEShape) { + OLEShape oleShape = (OLEShape) shape; + ObjectData data = null; + try { + data = oleShape.getObjectData(); + } catch (NullPointerException e) { /* getObjectData throws NPE some times. */ + } + + if (data != null) { + String objID = Integer.toString(oleShape.getObjectID()); + + // Embedded Object: add a <div + // class="embedded" id="X"/> so consumer can see where + // in the main text each embedded document + // occurred: + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", objID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + + TikaInputStream stream = + TikaInputStream.get(data.getData()); + try { + String mediaType = null; + if ("Excel.Chart.8".equals(oleShape.getProgID())) { + mediaType = "application/vnd.ms-excel"; + } + handleEmbeddedResource( + stream, objID, objID, + mediaType, xhtml, false); + } finally { + stream.close(); + } + } } - - if (data != null) { - String objID = Integer.toString(oleShape.getObjectID()); - - // Embedded Object: add a <div - // class="embedded" id="X"/> so consumer can see where - // in the main text each embedded document - // occurred: - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", objID); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - - TikaInputStream stream = - TikaInputStream.get(data.getData()); - try { - String mediaType = null; - if ("Excel.Chart.8".equals(oleShape.getProgID())) { - mediaType = "application/vnd.ms-excel"; - } - handleEmbeddedResource( - stream, objID, objID, - mediaType, xhtml, false); - } finally { - stream.close(); - } - } - } - } - } + } + } }
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Fri May 29 14:36:21 2015 @@ -71,7 +71,7 @@ public class ListManager extends Abstrac ListData listData = listTables.getListData(paragraph.getList().getLsid()); LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length]; for (int i = 0; i < listData.getLevels().length; i++) { - levelTuples[i] = buildTuple(i,listData.getLevels()[i]); + levelTuples[i] = buildTuple(i, listData.getLevels()[i]); } lc = new ParagraphLevelCounter(levelTuples); } @@ -89,7 +89,7 @@ public class ListManager extends Abstrac boolean isLegal = false; int start = 1; int restart = -1; - String lvlText = "%"+i+"."; + String lvlText = "%" + i + "."; String numFmt = "decimal"; start = listLevel.getStartAt(); @@ -127,18 +127,18 @@ public class ListManager extends Abstrac StringBuilder sb = new StringBuilder(); int last = 0; - for (int i = 0; i < numberOffsets.length;i++) { - int offset = (int)numberOffsets[i]; + for (int i = 0; i < numberOffsets.length; i++) { + int offset = (int) numberOffsets[i]; - if (offset == 0){ + if (offset == 0) { break; } - sb.append(numberText.substring(last, offset-1)); + sb.append(numberText.substring(last, offset - 1)); //need to add one because newer format //adds one. In .doc, this was the array index; //but in .docx, this is the level number - int lvlNum = (int)numberText.charAt(offset-1)+1; - sb.append("%"+lvlNum); + int lvlNum = (int) numberText.charAt(offset - 1) + 1; + sb.append("%" + lvlNum); last = offset; } if (last < numberText.length()) { @@ -149,29 +149,29 @@ public class ListManager extends Abstrac private String convertToNewNumFormat(int numberFormat) { switch (numberFormat) { - case -1 : + case -1: return "none"; - case 0 : + case 0: return "decimal"; - case 1 : + case 1: return "upperRoman"; - case 2 : + case 2: return "lowerRoman"; - case 3 : + case 3: return "upperLetter"; - case 4 : + case 4: return "lowerLetter"; - case 5 : + case 5: return "ordinal"; - case 22 : + case 22: return "decimalZero"; - case 23 : + case 23: return "bullet"; - case 47 : + case 47: return "none"; - default : + default: //do we really want to silently swallow these uncovered cases? - throw new RuntimeException("NOT COVERED: "+numberFormat); + throw new RuntimeException("NOT COVERED: " + numberFormat); } } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri May 29 14:36:21 2015 @@ -55,7 +55,9 @@ import org.xml.sax.SAXException; */ public class OfficeParser extends AbstractParser { - /** Serial version UID */ + /** + * Serial version UID + */ private static final long serialVersionUID = 7393462244028653479L; private static final Set<MediaType> SUPPORTED_TYPES = @@ -75,64 +77,7 @@ public class OfficeParser extends Abstra POIFSDocumentType.SOLIDWORKS_PART.type, POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type, POIFSDocumentType.SOLIDWORKS_DRAWING.type - ))); - - public enum POIFSDocumentType { - WORKBOOK("xls", MediaType.application("vnd.ms-excel")), - OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE), - COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ), - WORDDOCUMENT("doc", MediaType.application("msword")), - UNKNOWN("unknown", MediaType.application("x-tika-msoffice")), - ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")), - POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")), - PUBLISHER("pub", MediaType.application("x-mspublisher")), - PROJECT("mpp", MediaType.application("vnd.ms-project")), - VISIO("vsd", MediaType.application("vnd.visio")), - WORKS("wps", MediaType.application("vnd.ms-works")), - XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")), - OUTLOOK("msg", MediaType.application("vnd.ms-outlook")), - SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")), - SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")), - SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")); - - private final String extension; - private final MediaType type; - - POIFSDocumentType(String extension, MediaType type) { - this.extension = extension; - this.type = type; - } - - public String getExtension() { - return extension; - } - - public MediaType getType() { - return type; - } - - public static POIFSDocumentType detectType(POIFSFileSystem fs) { - return detectType(fs.getRoot()); - } - - public static POIFSDocumentType detectType(NPOIFSFileSystem fs) { - return detectType(fs.getRoot()); - } - - public static POIFSDocumentType detectType(DirectoryEntry node) { - Set<String> names = new HashSet<String>(); - for (Entry entry : node) { - names.add(entry.getName()); - } - MediaType type = POIFSContainerDetector.detect(names, node); - for (POIFSDocumentType poifsType : values()) { - if (type.equals(poifsType.type)) { - return poifsType; - } - } - return UNKNOWN; - } - } + ))); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -183,84 +128,84 @@ public class OfficeParser extends Abstra // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); - if (type!=POIFSDocumentType.UNKNOWN) { + if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); } switch (type) { - case SOLIDWORKS_PART: - case SOLIDWORKS_ASSEMBLY: - case SOLIDWORKS_DRAWING: - break; - case PUBLISHER: - PublisherTextExtractor publisherTextExtractor = - new PublisherTextExtractor(root); - xhtml.element("p", publisherTextExtractor.getText()); - break; - case WORDDOCUMENT: - new WordExtractor(context).parse(root, xhtml); - break; - case POWERPOINT: - new HSLFExtractor(context).parse(root, xhtml); - break; - case WORKBOOK: - case XLR: - Locale locale = context.get(Locale.class, Locale.getDefault()); - new ExcelExtractor(context, metadata).parse(root, xhtml, locale); - break; - case PROJECT: - // We currently can't do anything beyond the metadata - break; - case VISIO: - VisioTextExtractor visioTextExtractor = - new VisioTextExtractor(root); - for (String text : visioTextExtractor.getAllText()) { - xhtml.element("p", text); - } - break; - case OUTLOOK: - OutlookExtractor extractor = - new OutlookExtractor(root, context); - - extractor.parse(xhtml, metadata); - break; - case ENCRYPTED: - EncryptionInfo info = new EncryptionInfo(root); - Decryptor d = Decryptor.getInstance(info); - - try { - // By default, use the default Office Password - String password = Decryptor.DEFAULT_PASSWORD; - - // If they supplied a Password Provider, ask that for the password, - // and use the provider given one if available (stick with default if not) - PasswordProvider passwordProvider = context.get(PasswordProvider.class); - if (passwordProvider != null) { - String suppliedPassword = passwordProvider.getPassword(metadata); - if (suppliedPassword != null) { - password = suppliedPassword; - } - } - - // Check if we've the right password or not - if (!d.verifyPassword(password)) { - throw new EncryptedDocumentException(); - } - - // Decrypt the OLE2 stream, and delegate the resulting OOXML - // file to the regular OOXML parser for normal handling - OOXMLParser parser = new OOXMLParser(); - - parser.parse(d.getDataStream(root), new EmbeddedContentHandler( - new BodyContentHandler(xhtml)), - metadata, context); - } catch (GeneralSecurityException ex) { - throw new EncryptedDocumentException(ex); - } - default: - // For unsupported / unhandled types, just the metadata - // is extracted, which happened above - break; + case SOLIDWORKS_PART: + case SOLIDWORKS_ASSEMBLY: + case SOLIDWORKS_DRAWING: + break; + case PUBLISHER: + PublisherTextExtractor publisherTextExtractor = + new PublisherTextExtractor(root); + xhtml.element("p", publisherTextExtractor.getText()); + break; + case WORDDOCUMENT: + new WordExtractor(context).parse(root, xhtml); + break; + case POWERPOINT: + new HSLFExtractor(context).parse(root, xhtml); + break; + case WORKBOOK: + case XLR: + Locale locale = context.get(Locale.class, Locale.getDefault()); + new ExcelExtractor(context, metadata).parse(root, xhtml, locale); + break; + case PROJECT: + // We currently can't do anything beyond the metadata + break; + case VISIO: + VisioTextExtractor visioTextExtractor = + new VisioTextExtractor(root); + for (String text : visioTextExtractor.getAllText()) { + xhtml.element("p", text); + } + break; + case OUTLOOK: + OutlookExtractor extractor = + new OutlookExtractor(root, context); + + extractor.parse(xhtml, metadata); + break; + case ENCRYPTED: + EncryptionInfo info = new EncryptionInfo(root); + Decryptor d = Decryptor.getInstance(info); + + try { + // By default, use the default Office Password + String password = Decryptor.DEFAULT_PASSWORD; + + // If they supplied a Password Provider, ask that for the password, + // and use the provider given one if available (stick with default if not) + PasswordProvider passwordProvider = context.get(PasswordProvider.class); + if (passwordProvider != null) { + String suppliedPassword = passwordProvider.getPassword(metadata); + if (suppliedPassword != null) { + password = suppliedPassword; + } + } + + // Check if we've the right password or not + if (!d.verifyPassword(password)) { + throw new EncryptedDocumentException(); + } + + // Decrypt the OLE2 stream, and delegate the resulting OOXML + // file to the regular OOXML parser for normal handling + OOXMLParser parser = new OOXMLParser(); + + parser.parse(d.getDataStream(root), new EmbeddedContentHandler( + new BodyContentHandler(xhtml)), + metadata, context); + } catch (GeneralSecurityException ex) { + throw new EncryptedDocumentException(ex); + } + default: + // For unsupported / unhandled types, just the metadata + // is extracted, which happened above + break; } } @@ -268,4 +213,61 @@ public class OfficeParser extends Abstra metadata.set(Metadata.CONTENT_TYPE, type.toString()); } + public enum POIFSDocumentType { + WORKBOOK("xls", MediaType.application("vnd.ms-excel")), + OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE), + COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ), + WORDDOCUMENT("doc", MediaType.application("msword")), + UNKNOWN("unknown", MediaType.application("x-tika-msoffice")), + ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")), + POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")), + PUBLISHER("pub", MediaType.application("x-mspublisher")), + PROJECT("mpp", MediaType.application("vnd.ms-project")), + VISIO("vsd", MediaType.application("vnd.visio")), + WORKS("wps", MediaType.application("vnd.ms-works")), + XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")), + OUTLOOK("msg", MediaType.application("vnd.ms-outlook")), + SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")), + SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")), + SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")); + + private final String extension; + private final MediaType type; + + POIFSDocumentType(String extension, MediaType type) { + this.extension = extension; + this.type = type; + } + + public static POIFSDocumentType detectType(POIFSFileSystem fs) { + return detectType(fs.getRoot()); + } + + public static POIFSDocumentType detectType(NPOIFSFileSystem fs) { + return detectType(fs.getRoot()); + } + + public static POIFSDocumentType detectType(DirectoryEntry node) { + Set<String> names = new HashSet<String>(); + for (Entry entry : node) { + names.add(entry.getName()); + } + MediaType type = POIFSContainerDetector.detect(names, node); + for (POIFSDocumentType poifsType : values()) { + if (type.equals(poifsType.type)) { + return poifsType; + } + } + return UNKNOWN; + } + + public String getExtension() { + return extension; + } + + public MediaType getType() { + return type; + } + } + } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Fri May 29 14:36:21 2015 @@ -37,53 +37,28 @@ import org.xml.sax.SAXException; /** * A POI-powered Tika Parser for very old versions of Excel, from - * pre-OLE2 days, such as Excel 4. + * pre-OLE2 days, such as Excel 4. */ public class OldExcelParser extends AbstractParser { - private static final long serialVersionUID = 4611820730372823452L; - - private static final Set<MediaType> SUPPORTED_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("vnd.ms-excel.sheet.4"), - MediaType.application("vnd.ms-excel.workspace.4"), - MediaType.application("vnd.ms-excel.sheet.3"), - MediaType.application("vnd.ms-excel.workspace.3"), - MediaType.application("vnd.ms-excel.sheet.2") - ))); + private static final long serialVersionUID = 4611820730372823452L; - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-excel.sheet.4"), + MediaType.application("vnd.ms-excel.workspace.4"), + MediaType.application("vnd.ms-excel.sheet.3"), + MediaType.application("vnd.ms-excel.workspace.3"), + MediaType.application("vnd.ms-excel.sheet.2") + ))); - /** - * Extracts properties and text from an MS Document input stream - */ - public void parse( - InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - // Open the POI provided extractor - OldExcelExtractor extractor = new OldExcelExtractor(stream); - - // We can't do anything about metadata, as these old formats - // didn't have any stored with them - - // Set the content type - // TODO Get the version and type, to set as the Content Type - - // Have the text extracted and given to our Content Handler - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - parse(extractor, xhtml); - } - - protected static void parse(OldExcelExtractor extractor, - XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException { + protected static void parse(OldExcelExtractor extractor, + XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException { // Get the whole text, as a single string String text = extractor.getText(); - + // Split and output xhtml.startDocument(); - + String line; BufferedReader reader = new BufferedReader(new StringReader(text)); while ((line = reader.readLine()) != null) { @@ -91,7 +66,32 @@ public class OldExcelParser extends Abst xhtml.characters(line); xhtml.endElement("p"); } - + xhtml.endDocument(); } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Extracts properties and text from an MS Document input stream + */ + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // Open the POI provided extractor + OldExcelExtractor extractor = new OldExcelExtractor(stream); + + // We can't do anything about metadata, as these old formats + // didn't have any stored with them + + // Set the content type + // TODO Get the version and type, to set as the Content Type + + // Have the text extracted and given to our Content Handler + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + parse(extractor, xhtml); + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri May 29 14:36:21 2015 @@ -60,7 +60,7 @@ public class OutlookExtractor extends Ab public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException { super(context); - + try { this.msg = new MAPIMessage(root); } catch (IOException e) { @@ -71,185 +71,187 @@ public class OutlookExtractor extends Ab public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException { try { - msg.setReturnNullOnMissingChunk(true); - - // If the message contains strings that aren't stored - // as Unicode, try to sort out an encoding for them - if(msg.has7BitEncodingStrings()) { - if(msg.getHeaders() != null) { - // There's normally something in the headers - msg.guess7BitEncoding(); - } else { - // Nothing in the header, try encoding detection - // on the message body - StringChunk text = msg.getMainChunks().textBodyChunk; - if(text != null) { - CharsetDetector detector = new CharsetDetector(); - detector.setText( text.getRawValue() ); - CharsetMatch match = detector.detect(); - if(match.getConfidence() > 35) { - msg.set7BitEncoding( match.getName() ); + msg.setReturnNullOnMissingChunk(true); + + // If the message contains strings that aren't stored + // as Unicode, try to sort out an encoding for them + if (msg.has7BitEncodingStrings()) { + if (msg.getHeaders() != null) { + // There's normally something in the headers + msg.guess7BitEncoding(); + } else { + // Nothing in the header, try encoding detection + // on the message body + StringChunk text = msg.getMainChunks().textBodyChunk; + if (text != null) { + CharsetDetector detector = new CharsetDetector(); + detector.setText(text.getRawValue()); + CharsetMatch match = detector.detect(); + if (match.getConfidence() > 35) { + msg.set7BitEncoding(match.getName()); + } } - } - } - } - - // Start with the metadata - String subject = msg.getSubject(); - String from = msg.getDisplayFrom(); - - metadata.set(TikaCoreProperties.CREATOR, from); - metadata.set(Metadata.MESSAGE_FROM, from); - metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); - metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); - metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); - - metadata.set(TikaCoreProperties.TITLE, subject); - // TODO: Move to description in Tika 2.0 - metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, - msg.getConversationTopic()); - - try { - for(String recipientAddress : msg.getRecipientEmailAddressList()) { - if(recipientAddress != null) - metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); - } - } catch(ChunkNotFoundException he) {} // Will be fixed in POI 3.7 Final - - // Date - try two ways to find it - // First try via the proper chunk - if(msg.getMessageDate() != null) { - metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); - metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); - } else { - try { - // Failing that try via the raw headers - String[] headers = msg.getHeaders(); - if(headers != null && headers.length > 0) { - for(String header: headers) { - if(header.toLowerCase(Locale.ROOT).startsWith("date:")) { - String date = header.substring(header.indexOf(':')+1).trim(); - - // See if we can parse it as a normal mail date - try { - Date d = MboxParser.parseDate(date); - metadata.set(TikaCoreProperties.CREATED, d); - metadata.set(TikaCoreProperties.MODIFIED, d); - } catch(ParseException e) { - // Store it as-is, and hope for the best... - metadata.set(TikaCoreProperties.CREATED, date); - metadata.set(TikaCoreProperties.MODIFIED, date); + } + } + + // Start with the metadata + String subject = msg.getSubject(); + String from = msg.getDisplayFrom(); + + metadata.set(TikaCoreProperties.CREATOR, from); + metadata.set(Metadata.MESSAGE_FROM, from); + metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); + metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); + metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); + + metadata.set(TikaCoreProperties.TITLE, subject); + // TODO: Move to description in Tika 2.0 + metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, + msg.getConversationTopic()); + + try { + for (String recipientAddress : msg.getRecipientEmailAddressList()) { + if (recipientAddress != null) + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); + } + } catch (ChunkNotFoundException he) { + } // Will be fixed in POI 3.7 Final + + // Date - try two ways to find it + // First try via the proper chunk + if (msg.getMessageDate() != null) { + metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); + metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); + } else { + try { + // Failing that try via the raw headers + String[] headers = msg.getHeaders(); + if (headers != null && headers.length > 0) { + for (String header : headers) { + if (header.toLowerCase(Locale.ROOT).startsWith("date:")) { + String date = header.substring(header.indexOf(':') + 1).trim(); + + // See if we can parse it as a normal mail date + try { + Date d = MboxParser.parseDate(date); + metadata.set(TikaCoreProperties.CREATED, d); + metadata.set(TikaCoreProperties.MODIFIED, d); + } catch (ParseException e) { + // Store it as-is, and hope for the best... + metadata.set(TikaCoreProperties.CREATED, date); + metadata.set(TikaCoreProperties.MODIFIED, date); + } + break; } - break; } - } - } - } catch(ChunkNotFoundException he) { - // We can't find the date, sorry... - } - } - - - xhtml.element("h1", subject); - - // Output the from and to details in text, as you - // often want them in text form for searching - xhtml.startElement("dl"); - if (from!=null) { - header(xhtml, "From", from); - } - header(xhtml, "To", msg.getDisplayTo()); - header(xhtml, "Cc", msg.getDisplayCC()); - header(xhtml, "Bcc", msg.getDisplayBCC()); - try { - header(xhtml, "Recipients", msg.getRecipientEmailAddress()); - } catch(ChunkNotFoundException e) {} - xhtml.endElement("dl"); - - // Get the message body. Preference order is: html, rtf, text - Chunk htmlChunk = null; - Chunk rtfChunk = null; - Chunk textChunk = null; - for(Chunk chunk : msg.getMainChunks().getChunks()) { - if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { - htmlChunk = chunk; - } - if(chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { - rtfChunk = chunk; - } - if(chunk.getChunkId() == MAPIProperty.BODY.id) { - textChunk = chunk; - } - } - - boolean doneBody = false; - xhtml.startElement("div", "class", "message-body"); - if(htmlChunk != null) { - byte[] data = null; - if(htmlChunk instanceof ByteChunk) { - data = ((ByteChunk)htmlChunk).getValue(); - } else if(htmlChunk instanceof StringChunk) { - data = ((StringChunk)htmlChunk).getRawValue(); - } - if(data != null) { - HtmlParser htmlParser = new HtmlParser(); - htmlParser.parse( - new ByteArrayInputStream(data), - new EmbeddedContentHandler(new BodyContentHandler(xhtml)), - new Metadata(), new ParseContext() - ); - doneBody = true; - } - } - if(rtfChunk != null && !doneBody) { - ByteChunk chunk = (ByteChunk)rtfChunk; - MAPIRtfAttribute rtf = new MAPIRtfAttribute( - MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() - ); - RTFParser rtfParser = new RTFParser(); - rtfParser.parse( - new ByteArrayInputStream(rtf.getData()), - new EmbeddedContentHandler(new BodyContentHandler(xhtml)), - new Metadata(), new ParseContext()); - doneBody = true; - } - if(textChunk != null && !doneBody) { - xhtml.element("p", ((StringChunk)textChunk).getValue()); - } - xhtml.endElement("div"); - - // Process the attachments - for (AttachmentChunks attachment : msg.getAttachmentFiles()) { - xhtml.startElement("div", "class", "attachment-entry"); - - String filename = null; - if (attachment.attachLongFileName != null) { - filename = attachment.attachLongFileName.getValue(); - } else if (attachment.attachFileName != null) { - filename = attachment.attachFileName.getValue(); - } - if (filename != null && filename.length() > 0) { - xhtml.element("h1", filename); - } - - if(attachment.attachData != null) { - handleEmbeddedResource( - TikaInputStream.get(attachment.attachData.getValue()), - filename, null, - null, xhtml, true - ); - } - if(attachment.attachmentDirectory != null) { - handleEmbeddedOfficeDoc( - attachment.attachmentDirectory.getDirectory(), - xhtml - ); - } - - xhtml.endElement("div"); - } - } catch(ChunkNotFoundException e) { - throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); + } + } catch (ChunkNotFoundException he) { + // We can't find the date, sorry... + } + } + + + xhtml.element("h1", subject); + + // Output the from and to details in text, as you + // often want them in text form for searching + xhtml.startElement("dl"); + if (from != null) { + header(xhtml, "From", from); + } + header(xhtml, "To", msg.getDisplayTo()); + header(xhtml, "Cc", msg.getDisplayCC()); + header(xhtml, "Bcc", msg.getDisplayBCC()); + try { + header(xhtml, "Recipients", msg.getRecipientEmailAddress()); + } catch (ChunkNotFoundException e) { + } + xhtml.endElement("dl"); + + // Get the message body. Preference order is: html, rtf, text + Chunk htmlChunk = null; + Chunk rtfChunk = null; + Chunk textChunk = null; + for (Chunk chunk : msg.getMainChunks().getChunks()) { + if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { + htmlChunk = chunk; + } + if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { + rtfChunk = chunk; + } + if (chunk.getChunkId() == MAPIProperty.BODY.id) { + textChunk = chunk; + } + } + + boolean doneBody = false; + xhtml.startElement("div", "class", "message-body"); + if (htmlChunk != null) { + byte[] data = null; + if (htmlChunk instanceof ByteChunk) { + data = ((ByteChunk) htmlChunk).getValue(); + } else if (htmlChunk instanceof StringChunk) { + data = ((StringChunk) htmlChunk).getRawValue(); + } + if (data != null) { + HtmlParser htmlParser = new HtmlParser(); + htmlParser.parse( + new ByteArrayInputStream(data), + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + new Metadata(), new ParseContext() + ); + doneBody = true; + } + } + if (rtfChunk != null && !doneBody) { + ByteChunk chunk = (ByteChunk) rtfChunk; + MAPIRtfAttribute rtf = new MAPIRtfAttribute( + MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() + ); + RTFParser rtfParser = new RTFParser(); + rtfParser.parse( + new ByteArrayInputStream(rtf.getData()), + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + new Metadata(), new ParseContext()); + doneBody = true; + } + if (textChunk != null && !doneBody) { + xhtml.element("p", ((StringChunk) textChunk).getValue()); + } + xhtml.endElement("div"); + + // Process the attachments + for (AttachmentChunks attachment : msg.getAttachmentFiles()) { + xhtml.startElement("div", "class", "attachment-entry"); + + String filename = null; + if (attachment.attachLongFileName != null) { + filename = attachment.attachLongFileName.getValue(); + } else if (attachment.attachFileName != null) { + filename = attachment.attachFileName.getValue(); + } + if (filename != null && filename.length() > 0) { + xhtml.element("h1", filename); + } + + if (attachment.attachData != null) { + handleEmbeddedResource( + TikaInputStream.get(attachment.attachData.getValue()), + filename, null, + null, xhtml, true + ); + } + if (attachment.attachmentDirectory != null) { + handleEmbeddedOfficeDoc( + attachment.attachmentDirectory.getDirectory(), + xhtml + ); + } + + xhtml.endElement("div"); + } + } catch (ChunkNotFoundException e) { + throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri May 29 14:36:21 2015 @@ -40,162 +40,135 @@ import org.apache.tika.mime.MediaType; /** * A detector that works on a POIFS OLE2 document - * to figure out exactly what the file is. + * to figure out exactly what the file is. * This should work for all OLE2 documents, whether - * they are ones supported by POI or not. + * they are ones supported by POI or not. */ public class POIFSContainerDetector implements Detector { - /** Serial version UID */ - private static final long serialVersionUID = -3028021741663605293L; - - /** An ASCII String "StarImpress" */ - private static final byte [] STAR_IMPRESS = new byte [] { - 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73 - }; - - /** An ASCII String "StarDraw" */ - private static final byte [] STAR_DRAW = new byte [] { - 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77 - }; - - /** An ASCII String "Quill96" for Works Files */ - private static final byte [] WORKS_QUILL96 = new byte[] { - 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36 - }; - - /** The OLE base file format */ + /** + * The OLE base file format + */ public static final MediaType OLE = application("x-tika-msoffice"); - - /** The protected OOXML base file format */ + /** + * The protected OOXML base file format + */ public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected"); - - /** General embedded document type within an OLE2 container */ + /** + * General embedded document type within an OLE2 container + */ public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded"); - - /** An OLE10 Native embedded document within another OLE2 document */ + /** + * An OLE10 Native embedded document within another OLE2 document + */ public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, "format", "ole10_native"); - - /** Some other kind of embedded document, in a CompObj container within another OLE2 document */ + /** + * Some other kind of embedded document, in a CompObj container within another OLE2 document + */ public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, "format", "comp_obj"); - - /** Microsoft Excel */ + /** + * Microsoft Excel + */ public static final MediaType XLS = application("vnd.ms-excel"); - - /** Microsoft Word */ + /** + * Microsoft Word + */ public static final MediaType DOC = application("msword"); - - /** Microsoft PowerPoint */ + /** + * Microsoft PowerPoint + */ public static final MediaType PPT = application("vnd.ms-powerpoint"); - - /** Microsoft Publisher */ + /** + * Microsoft Publisher + */ public static final MediaType PUB = application("x-mspublisher"); - - /** Microsoft Visio */ + /** + * Microsoft Visio + */ public static final MediaType VSD = application("vnd.visio"); - - /** Microsoft Works */ + /** + * Microsoft Works + */ public static final MediaType WPS = application("vnd.ms-works"); - - /** Microsoft Works Spreadsheet 7.0 */ + /** + * Microsoft Works Spreadsheet 7.0 + */ public static final MediaType XLR = application("x-tika-msworks-spreadsheet"); - - /** Microsoft Outlook */ + /** + * Microsoft Outlook + */ public static final MediaType MSG = application("vnd.ms-outlook"); - - /** Microsoft Project */ + /** + * Microsoft Project + */ public static final MediaType MPP = application("vnd.ms-project"); - - /** StarOffice Calc */ + /** + * StarOffice Calc + */ public static final MediaType SDC = application("vnd.stardivision.calc"); - - /** StarOffice Draw */ + /** + * StarOffice Draw + */ public static final MediaType SDA = application("vnd.stardivision.draw"); - - /** StarOffice Impress */ + /** + * StarOffice Impress + */ public static final MediaType SDD = application("vnd.stardivision.impress"); - - /** StarOffice Writer */ + /** + * StarOffice Writer + */ public static final MediaType SDW = application("vnd.stardivision.writer"); - - /** SolidWorks CAD file */ + /** + * SolidWorks CAD file + */ public static final MediaType SLDWORKS = application("sldworks"); - - /** Regexp for matching the MPP Project Data stream */ + /** + * Serial version UID + */ + private static final long serialVersionUID = -3028021741663605293L; + /** + * An ASCII String "StarImpress" + */ + private static final byte[] STAR_IMPRESS = new byte[]{ + 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73 + }; + /** + * An ASCII String "StarDraw" + */ + private static final byte[] STAR_DRAW = new byte[]{ + 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77 + }; + /** + * An ASCII String "Quill96" for Works Files + */ + private static final byte[] WORKS_QUILL96 = new byte[]{ + 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36 + }; + /** + * Regexp for matching the MPP Project Data stream + */ private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); - public MediaType detect(InputStream input, Metadata metadata) - throws IOException { - // Check if we have access to the document - if (input == null) { - return MediaType.OCTET_STREAM; - } - - // If this is a TikaInputStream wrapping an already - // parsed NPOIFileSystem/DirectoryNode, just get the - // names from the root: - TikaInputStream tis = TikaInputStream.cast(input); - Set<String> names = null; - if (tis != null) { - Object container = tis.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot()); - } else if (container instanceof DirectoryNode) { - names = getTopLevelNames((DirectoryNode) container); - } - } - - if (names == null) { - // Check if the document starts with the OLE header - input.mark(8); - try { - if (input.read() != 0xd0 || input.read() != 0xcf - || input.read() != 0x11 || input.read() != 0xe0 - || input.read() != 0xa1 || input.read() != 0xb1 - || input.read() != 0x1a || input.read() != 0xe1) { - return MediaType.OCTET_STREAM; - } - } finally { - input.reset(); - } - } - - // We can only detect the exact type when given a TikaInputStream - if (names == null && tis != null) { - // Look for known top level entry names to detect the document type - names = getTopLevelNames(tis); - } - - // Detect based on the names (as available) - if (tis != null && - tis.getOpenContainer() != null && - tis.getOpenContainer() instanceof NPOIFSFileSystem) { - return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot()); - } else { - return detect(names, null); - } - } - /** * Internal detection of the specific kind of OLE2 document, based on the * names of the top level streams within the file. - * + * * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root - * entry of the filesystem whose type is to be detected, as a - * second argument. + * entry of the filesystem whose type is to be detected, as a + * second argument. */ protected static MediaType detect(Set<String> names) { return detect(names, null); } - + /** * Internal detection of the specific kind of OLE2 document, based on the * names of the top-level streams within the file. In some cases the * detection may need access to the root {@link DirectoryEntry} of that file * for best results. The entry can be given as a second, optional argument. - * + * * @param names * @param root * @return @@ -227,20 +200,20 @@ public class POIFSContainerDetector impl // This check has to be before names.contains("Workbook") // Works 7.0 spreadsheet files contain both // we want to avoid classifying this as Excel - return XLR; + return XLR; } else if (names.contains("Workbook") || names.contains("WORKBOOK")) { return XLS; } else if (names.contains("Book")) { - // Excel 95 or older, we won't be able to parse this.... - return XLS; - } else if (names.contains("EncryptedPackage") && + // Excel 95 or older, we won't be able to parse this.... + return XLS; + } else if (names.contains("EncryptedPackage") && names.contains("EncryptionInfo") && names.contains("\u0006DataSpaces")) { // This is a protected OOXML document, which is an OLE2 file // with an Encrypted Stream which holds the OOXML data // Without decrypting the stream, we can't tell what kind of // OOXML file we have. Return a general OOXML Protected type, - // and hope the name based detection can guess the rest! + // and hope the name based detection can guess the rest! return OOXML_PROTECTED; } else if (names.contains("EncryptedPackage")) { return OLE; @@ -263,33 +236,33 @@ public class POIFSContainerDetector impl } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) { return COMP_OBJ; } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) { - // CompObj is a general kind of OLE2 embedding, but this may be an old Works file - // If we have the Directory, check - if (root != null) { - MediaType type = processCompObjFormatType(root); - if (type == WPS) { - return WPS; - } else { - // Assume it's a general CompObj embedded resource - return COMP_OBJ; - } - } else { - // Assume it's a general CompObj embedded resource - return COMP_OBJ; - } + // CompObj is a general kind of OLE2 embedding, but this may be an old Works file + // If we have the Directory, check + if (root != null) { + MediaType type = processCompObjFormatType(root); + if (type == WPS) { + return WPS; + } else { + // Assume it's a general CompObj embedded resource + return COMP_OBJ; + } + } else { + // Assume it's a general CompObj embedded resource + return COMP_OBJ; + } } else if (names.contains("CONTENTS")) { - // CONTENTS without SPELLING nor CompObj normally means some sort - // of embedded non-office file inside an OLE2 document - // This is most commonly triggered on nested directories - return OLE; + // CONTENTS without SPELLING nor CompObj normally means some sort + // of embedded non-office file inside an OLE2 document + // This is most commonly triggered on nested directories + return OLE; } else if (names.contains("\u0001CompObj") && - (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) { - // Could be Project, look for common name patterns - for (String name : names) { - if (mppDataMatch.matcher(name).matches()) { - return MPP; - } - } + (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) { + // Could be Project, look for common name patterns + for (String name : names) { + if (mppDataMatch.matcher(name).matches()) { + return MPP; + } + } } else if (names.contains("PerfectOffice_MAIN")) { if (names.contains("SlideShow")) { return MediaType.application("x-corelpresentations"); // .shw @@ -313,36 +286,36 @@ public class POIFSContainerDetector impl /** * Is this one of the kinds of formats which uses CompObj to - * store all of their data, eg Star Draw, Star Impress or - * (older) Works? + * store all of their data, eg Star Draw, Star Impress or + * (older) Works? * If not, it's likely an embedded resource */ private static MediaType processCompObjFormatType(DirectoryEntry root) { try { Entry e = root.getEntry("\u0001CompObj"); if (e != null && e.isDocumentEntry()) { - DocumentNode dn = (DocumentNode)e; + DocumentNode dn = (DocumentNode) e; DocumentInputStream stream = new DocumentInputStream(dn); - byte [] bytes = IOUtils.toByteArray(stream); + byte[] bytes = IOUtils.toByteArray(stream); /* * This array contains a string with a normal ASCII name of the * application used to create this file. We want to search for that * name. */ - if ( arrayContains(bytes, STAR_DRAW) ) { + if (arrayContains(bytes, STAR_DRAW)) { return SDA; } else if (arrayContains(bytes, STAR_IMPRESS)) { return SDD; } else if (arrayContains(bytes, WORKS_QUILL96)) { - return WPS; + return WPS; } - } + } } catch (Exception e) { /* * "root.getEntry" can throw FileNotFoundException. The code inside * "if" can throw IOExceptions. Theoretically. Practically no * exceptions will likely ever appear. - * + * * Swallow all of them. If any occur, we just assume that we can't * distinguish between Draw and Impress and return something safe: * x-tika-msoffice @@ -350,10 +323,10 @@ public class POIFSContainerDetector impl } return OLE; } - + // poor man's search for byte arrays, replace with some library call if // you know one without adding new dependencies - private static boolean arrayContains(byte [] larger, byte [] smaller) { + private static boolean arrayContains(byte[] larger, byte[] smaller) { int largerCounter = 0; int smallerCounter = 0; while (largerCounter < larger.length) { @@ -365,7 +338,7 @@ public class POIFSContainerDetector impl } } else { largerCounter = largerCounter - smallerCounter + 1; - smallerCounter=0; + smallerCounter = 0; } } return false; @@ -401,4 +374,56 @@ public class POIFSContainerDetector impl } return names; } + + public MediaType detect(InputStream input, Metadata metadata) + throws IOException { + // Check if we have access to the document + if (input == null) { + return MediaType.OCTET_STREAM; + } + + // If this is a TikaInputStream wrapping an already + // parsed NPOIFileSystem/DirectoryNode, just get the + // names from the root: + TikaInputStream tis = TikaInputStream.cast(input); + Set<String> names = null; + if (tis != null) { + Object container = tis.getOpenContainer(); + if (container instanceof NPOIFSFileSystem) { + names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot()); + } else if (container instanceof DirectoryNode) { + names = getTopLevelNames((DirectoryNode) container); + } + } + + if (names == null) { + // Check if the document starts with the OLE header + input.mark(8); + try { + if (input.read() != 0xd0 || input.read() != 0xcf + || input.read() != 0x11 || input.read() != 0xe0 + || input.read() != 0xa1 || input.read() != 0xb1 + || input.read() != 0x1a || input.read() != 0xe1) { + return MediaType.OCTET_STREAM; + } + } finally { + input.reset(); + } + } + + // We can only detect the exact type when given a TikaInputStream + if (names == null && tis != null) { + // Look for known top level entry names to detect the document type + names = getTopLevelNames(tis); + } + + // Detect based on the names (as available) + if (tis != null && + tis.getOpenContainer() != null && + tis.getOpenContainer() instanceof NPOIFSFileSystem) { + return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot()); + } else { + return detect(names, null); + } + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Fri May 29 14:36:21 2015 @@ -50,10 +50,10 @@ public class SummaryExtractor { private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); private static final String SUMMARY_INFORMATION = - SummaryInformation.DEFAULT_STREAM_NAME; + SummaryInformation.DEFAULT_STREAM_NAME; private static final String DOCUMENT_SUMMARY_INFORMATION = - DocumentSummaryInformation.DEFAULT_STREAM_NAME; + DocumentSummaryInformation.DEFAULT_STREAM_NAME; private final Metadata metadata; @@ -77,9 +77,9 @@ public class SummaryExtractor { throws IOException, TikaException { try { DocumentEntry entry = - (DocumentEntry) root.getEntry(entryName); + (DocumentEntry) root.getEntry(entryName); PropertySet properties = - new PropertySet(new DocumentInputStream(entry)); + new PropertySet(new DocumentInputStream(entry)); if (properties.isSummaryInformation()) { parse(new SummaryInformation(properties)); } @@ -115,7 +115,7 @@ public class SummaryExtractor { set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted()); set(Metadata.EDIT_TIME, summary.getEditTime()); set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity()); - + // New style counts set(Office.WORD_COUNT, summary.getWordCount()); set(Office.CHARACTER_COUNT, summary.getCharCount()); @@ -123,7 +123,7 @@ public class SummaryExtractor { if (summary.getPageCount() > 0) { metadata.set(PagedText.N_PAGES, summary.getPageCount()); } - + // Old style, Tika 1.0 properties // TODO Remove these in Tika 2.0 set(Metadata.TEMPLATE, summary.getTemplate()); @@ -140,7 +140,7 @@ public class SummaryExtractor { set(OfficeOpenXMLExtended.MANAGER, summary.getManager()); set(TikaCoreProperties.LANGUAGE, getLanguage(summary)); set(OfficeOpenXMLCore.CATEGORY, summary.getCategory()); - + // New style counts set(Office.SLIDE_COUNT, summary.getSlideCount()); if (summary.getSlideCount() > 0) { @@ -152,7 +152,7 @@ public class SummaryExtractor { set(Metadata.MANAGER, summary.getManager()); set(MSOffice.SLIDE_COUNT, summary.getSlideCount()); set(Metadata.CATEGORY, summary.getCategory()); - + parse(summary.getCustomProperties()); } @@ -169,6 +169,7 @@ public class SummaryExtractor { /** * Attempt to parse custom document properties and add to the collection of metadata + * * @param customProperties */ private void parse(CustomProperties customProperties) { @@ -179,23 +180,23 @@ public class SummaryExtractor { // Get, convert and save property value Object value = customProperties.get(name); - if (value instanceof String){ - set(key, (String)value); + if (value instanceof String) { + set(key, (String) value); } else if (value instanceof Date) { Property prop = Property.externalDate(key); - metadata.set(prop, (Date)value); + metadata.set(prop, (Date) value); } else if (value instanceof Boolean) { Property prop = Property.externalBoolean(key); metadata.set(prop, value.toString()); } else if (value instanceof Long) { Property prop = Property.externalInteger(key); - metadata.set(prop, ((Long)value).intValue()); + metadata.set(prop, ((Long) value).intValue()); } else if (value instanceof Double) { Property prop = Property.externalReal(key); - metadata.set(prop, (Double)value); + metadata.set(prop, (Double) value); } else if (value instanceof Integer) { Property prop = Property.externalInteger(key); - metadata.set(prop, ((Integer)value).intValue()); + metadata.set(prop, ((Integer) value).intValue()); } } } @@ -206,7 +207,7 @@ public class SummaryExtractor { metadata.set(name, value); } } - + private void set(Property property, String value) { if (value != null) { metadata.set(property, value); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Fri May 29 14:36:21 2015 @@ -43,17 +43,17 @@ import org.xml.sax.SAXException; /** * A POI-powered Tika Parser for TNEF (Transport Neutral - * Encoding Format) messages, aka winmail.dat + * Encoding Format) messages, aka winmail.dat */ public class TNEFParser extends AbstractParser { - private static final long serialVersionUID = 4611820730372823452L; - - private static final Set<MediaType> SUPPORTED_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("vnd.ms-tnef"), - MediaType.application("ms-tnef"), - MediaType.application("x-tnef") - ))); + private static final long serialVersionUID = 4611820730372823452L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-tnef"), + MediaType.application("ms-tnef"), + MediaType.application("x-tnef") + ))); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -66,70 +66,70 @@ public class TNEFParser extends Abstract InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - - // We work by recursing, so get the appropriate bits - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - EmbeddedDocumentExtractor embeddedExtractor; - if (ex==null) { - embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); - } else { - embeddedExtractor = ex; - } - - // Ask POI to process the file for us - HMEFMessage msg = new HMEFMessage(stream); - - // Set the message subject if known - String subject = msg.getSubject(); - if(subject != null && subject.length() > 0) { - // TODO: Move to title in Tika 2.0 - metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); - } - - // Recurse into the message body RTF - MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); - if(attr != null && attr instanceof MAPIRtfAttribute) { - MAPIRtfAttribute rtf = (MAPIRtfAttribute)attr; - handleEmbedded( - "message.rtf", "application/rtf", - rtf.getData(), - embeddedExtractor, handler - ); - } - - // Recurse into each attachment in turn - for(Attachment attachment : msg.getAttachments()) { - String name = attachment.getLongFilename(); - if(name == null || name.length() == 0) { - name = attachment.getFilename(); - } - if(name == null || name.length() == 0) { - String ext = attachment.getExtension(); - if(ext != null) { - name = "unknown" + ext; - } - } - handleEmbedded( - name, null, attachment.getContents(), - embeddedExtractor, handler - ); - } + + // We work by recursing, so get the appropriate bits + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + EmbeddedDocumentExtractor embeddedExtractor; + if (ex == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } else { + embeddedExtractor = ex; + } + + // Ask POI to process the file for us + HMEFMessage msg = new HMEFMessage(stream); + + // Set the message subject if known + String subject = msg.getSubject(); + if (subject != null && subject.length() > 0) { + // TODO: Move to title in Tika 2.0 + metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); + } + + // Recurse into the message body RTF + MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); + if (attr != null && attr instanceof MAPIRtfAttribute) { + MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr; + handleEmbedded( + "message.rtf", "application/rtf", + rtf.getData(), + embeddedExtractor, handler + ); + } + + // Recurse into each attachment in turn + for (Attachment attachment : msg.getAttachments()) { + String name = attachment.getLongFilename(); + if (name == null || name.length() == 0) { + name = attachment.getFilename(); + } + if (name == null || name.length() == 0) { + String ext = attachment.getExtension(); + if (ext != null) { + name = "unknown" + ext; + } + } + handleEmbedded( + name, null, attachment.getContents(), + embeddedExtractor, handler + ); + } } - + private void handleEmbedded(String name, String type, byte[] contents, - EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler) - throws IOException, SAXException, TikaException { - Metadata metadata = new Metadata(); - if(name != null) - metadata.set(Metadata.RESOURCE_NAME_KEY, name); - if(type != null) - metadata.set(Metadata.CONTENT_TYPE, type); - - if (embeddedExtractor.shouldParseEmbedded(metadata)) { - embeddedExtractor.parseEmbedded( - TikaInputStream.get(contents), - new EmbeddedContentHandler(handler), - metadata, false); - } + EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler) + throws IOException, SAXException, TikaException { + Metadata metadata = new Metadata(); + if (name != null) + metadata.set(Metadata.RESOURCE_NAME_KEY, name); + if (type != null) + metadata.set(Metadata.CONTENT_TYPE, type); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + TikaInputStream.get(contents), + new EmbeddedContentHandler(handler), + metadata, false); + } } }
