a...

tallison Fri, 29 May 2015 07:37:22 -0700

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 Fri May 29 14:36:21 2015
@@ -45,198 +45,198 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 public class HSLFExtractor extends AbstractPOIFSExtractor {
-   public HSLFExtractor(ParseContext context) {
-      super(context);
-   }
-       
-   protected void parse(
-         NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
-         throws IOException, SAXException, TikaException {
-       parse(filesystem.getRoot(), xhtml);
-   }
-    
-   protected void parse(
-         DirectoryNode root, XHTMLContentHandler xhtml)
-         throws IOException, SAXException, TikaException {
-      HSLFSlideShow ss = new HSLFSlideShow(root);
-      SlideShow _show = new SlideShow(ss);
-      Slide[] _slides = _show.getSlides();
+    public HSLFExtractor(ParseContext context) {
+        super(context);
+    }
 
-      xhtml.startElement("div", "class", "slideShow");
+    protected void parse(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml);
+    }
 
-      /* Iterate over slides and extract text */
-      for( Slide slide : _slides ) {
-         xhtml.startElement("div", "class", "slide");
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        HSLFSlideShow ss = new HSLFSlideShow(root);
+        SlideShow _show = new SlideShow(ss);
+        Slide[] _slides = _show.getSlides();
 
-         // Slide header, if present
-         HeadersFooters hf = slide.getHeadersFooters();
-         if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) 
{
-            xhtml.startElement("p", "class", "slide-header");
+        xhtml.startElement("div", "class", "slideShow");
 
-            xhtml.characters( hf.getHeaderText() );
+      /* Iterate over slides and extract text */
+        for (Slide slide : _slides) {
+            xhtml.startElement("div", "class", "slide");
+
+            // Slide header, if present
+            HeadersFooters hf = slide.getHeadersFooters();
+            if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != 
null) {
+                xhtml.startElement("p", "class", "slide-header");
 
-            xhtml.endElement("p");
-         }
+                xhtml.characters(hf.getHeaderText());
 
-         // Slide master, if present
-         extractMaster(xhtml, slide.getMasterSheet());
+                xhtml.endElement("p");
+            }
 
-         // Slide text
-         {
-            xhtml.startElement("p", "class", "slide-content");
+            // Slide master, if present
+            extractMaster(xhtml, slide.getMasterSheet());
 
-            textRunsToText(xhtml, slide.getTextRuns());
+            // Slide text
+            {
+                xhtml.startElement("p", "class", "slide-content");
 
-            xhtml.endElement("p");
-         }
+                textRunsToText(xhtml, slide.getTextRuns());
 
-         // Table text
-         for (Shape shape: slide.getShapes()){
-            if (shape instanceof Table){
-               extractTableText(xhtml, (Table)shape);
+                xhtml.endElement("p");
             }
-         }
 
-         // Slide footer, if present
-         if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) 
{
-            xhtml.startElement("p", "class", "slide-footer");
+            // Table text
+            for (Shape shape : slide.getShapes()) {
+                if (shape instanceof Table) {
+                    extractTableText(xhtml, (Table) shape);
+                }
+            }
 
-            xhtml.characters( hf.getFooterText() );
+            // Slide footer, if present
+            if (hf != null && hf.isFooterVisible() && hf.getFooterText() != 
null) {
+                xhtml.startElement("p", "class", "slide-footer");
 
-            xhtml.endElement("p");
-         }
+                xhtml.characters(hf.getFooterText());
 
-         // Comments, if present
-         for( Comment comment : slide.getComments() ) {
-            xhtml.startElement("p", "class", "slide-comment");
-            if (comment.getAuthor() != null) {
-               xhtml.startElement("b");
-               xhtml.characters( comment.getAuthor() );
-               xhtml.endElement("b");
-               
-               if (comment.getText() != null) {
-                  xhtml.characters( " - ");
-               }
+                xhtml.endElement("p");
             }
-            if (comment.getText() != null) {
-               xhtml.characters( comment.getText() );
+
+            // Comments, if present
+            for (Comment comment : slide.getComments()) {
+                xhtml.startElement("p", "class", "slide-comment");
+                if (comment.getAuthor() != null) {
+                    xhtml.startElement("b");
+                    xhtml.characters(comment.getAuthor());
+                    xhtml.endElement("b");
+
+                    if (comment.getText() != null) {
+                        xhtml.characters(" - ");
+                    }
+                }
+                if (comment.getText() != null) {
+                    xhtml.characters(comment.getText());
+                }
+                xhtml.endElement("p");
             }
-            xhtml.endElement("p");
-         }
 
-         // Now any embedded resources
-         handleSlideEmbeddedResources(slide, xhtml);
+            // Now any embedded resources
+            handleSlideEmbeddedResources(slide, xhtml);
 
-         // TODO Find the Notes for this slide and extract inline
+            // TODO Find the Notes for this slide and extract inline
 
-         // Slide complete
-         xhtml.endElement("div");
-      }
+            // Slide complete
+            xhtml.endElement("div");
+        }
 
-      // All slides done
-      xhtml.endElement("div");
+        // All slides done
+        xhtml.endElement("div");
 
       /* notes */
-      xhtml.startElement("div", "class", "slideNotes");
-      HashSet<Integer> seenNotes = new HashSet<Integer>();
-      HeadersFooters hf = _show.getNotesHeadersFooters();
-
-      for (Slide slide : _slides) {
-         Notes notes = slide.getNotesSheet();
-         if (notes == null) {
-            continue;
-         }
-         Integer id = notes._getSheetNumber();
-         if (seenNotes.contains(id)) {
-            continue;
-         }
-         seenNotes.add(id);
-
-         // Repeat the Notes header, if set
-         if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) 
{
-            xhtml.startElement("p", "class", "slide-note-header");
-            xhtml.characters( hf.getHeaderText() );
-            xhtml.endElement("p");
-         }
-
-         // Notes text
-         textRunsToText(xhtml, notes.getTextRuns());
-
-         // Repeat the notes footer, if set
-         if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) 
{
-            xhtml.startElement("p", "class", "slide-note-footer");
-            xhtml.characters( hf.getFooterText() );
-            xhtml.endElement("p");
-         }
-      }
-
-      handleSlideEmbeddedPictures(_show, xhtml);
-
-      xhtml.endElement("div");
-   }
-
-   private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) 
throws SAXException {
-      if (master == null){
-         return;
-      }
-      Shape[] shapes = master.getShapes();
-      if (shapes == null || shapes.length == 0){
-         return;
-      }
-
-      xhtml.startElement("div", "class", "slide-master-content");
-      for (Shape shape : shapes){
-         if (shape != null && ! MasterSheet.isPlaceholder(shape)){
-            if (shape instanceof TextShape){
-               TextShape tsh = (TextShape)shape;
-               String text = tsh.getText();
-               if (text != null){
-                  xhtml.element("p", text);
-               }
-            }
-         }
-      }
-      xhtml.endElement("div");
-   }
-
-   private void extractTableText(XHTMLContentHandler xhtml, Table shape) 
throws SAXException {
-      xhtml.startElement("table");
-      for (int row = 0; row < shape.getNumberOfRows(); row++){
-         xhtml.startElement("tr");
-         for (int col = 0; col < shape.getNumberOfColumns(); col++){
-            TableCell cell = shape.getCell(row, col);
-            //insert empty string for empty cell if cell is null
-            String txt = "";
-            if (cell != null){
-               txt = cell.getText();
-            }
-            xhtml.element("td", txt);
-         }
-         xhtml.endElement("tr");
-      }
-      xhtml.endElement("table");   
-   }
-
-   private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) 
throws SAXException {
-      if (runs==null) {
-         return;
-      }
-
-      for (TextRun run : runs) {
-         if (run != null) {
-           // Leaving in wisdom from TIKA-712 for easy revert.
-           // Avoid boiler-plate text on the master slide (0
-           // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
-           //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 
1)) {
-           String txt = run.getText();
-           if (txt != null){
-               xhtml.characters(txt);
-               xhtml.startElement("br");
-               xhtml.endElement("br");
-           }
-         }
-      }
-   }
+        xhtml.startElement("div", "class", "slideNotes");
+        HashSet<Integer> seenNotes = new HashSet<Integer>();
+        HeadersFooters hf = _show.getNotesHeadersFooters();
+
+        for (Slide slide : _slides) {
+            Notes notes = slide.getNotesSheet();
+            if (notes == null) {
+                continue;
+            }
+            Integer id = notes._getSheetNumber();
+            if (seenNotes.contains(id)) {
+                continue;
+            }
+            seenNotes.add(id);
+
+            // Repeat the Notes header, if set
+            if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != 
null) {
+                xhtml.startElement("p", "class", "slide-note-header");
+                xhtml.characters(hf.getHeaderText());
+                xhtml.endElement("p");
+            }
+
+            // Notes text
+            textRunsToText(xhtml, notes.getTextRuns());
+
+            // Repeat the notes footer, if set
+            if (hf != null && hf.isFooterVisible() && hf.getFooterText() != 
null) {
+                xhtml.startElement("p", "class", "slide-note-footer");
+                xhtml.characters(hf.getFooterText());
+                xhtml.endElement("p");
+            }
+        }
+
+        handleSlideEmbeddedPictures(_show, xhtml);
+
+        xhtml.endElement("div");
+    }
+
+    private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) 
throws SAXException {
+        if (master == null) {
+            return;
+        }
+        Shape[] shapes = master.getShapes();
+        if (shapes == null || shapes.length == 0) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "slide-master-content");
+        for (Shape shape : shapes) {
+            if (shape != null && !MasterSheet.isPlaceholder(shape)) {
+                if (shape instanceof TextShape) {
+                    TextShape tsh = (TextShape) shape;
+                    String text = tsh.getText();
+                    if (text != null) {
+                        xhtml.element("p", text);
+                    }
+                }
+            }
+        }
+        xhtml.endElement("div");
+    }
+
+    private void extractTableText(XHTMLContentHandler xhtml, Table shape) 
throws SAXException {
+        xhtml.startElement("table");
+        for (int row = 0; row < shape.getNumberOfRows(); row++) {
+            xhtml.startElement("tr");
+            for (int col = 0; col < shape.getNumberOfColumns(); col++) {
+                TableCell cell = shape.getCell(row, col);
+                //insert empty string for empty cell if cell is null
+                String txt = "";
+                if (cell != null) {
+                    txt = cell.getText();
+                }
+                xhtml.element("td", txt);
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("table");
+    }
+
+    private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) 
throws SAXException {
+        if (runs == null) {
+            return;
+        }
+
+        for (TextRun run : runs) {
+            if (run != null) {
+                // Leaving in wisdom from TIKA-712 for easy revert.
+                // Avoid boiler-plate text on the master slide (0
+                // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+                //if (!isMaster || (run.getRunType() != 0 && run.getRunType() 
!= 1)) {
+                String txt = run.getText();
+                if (txt != null) {
+                    xhtml.characters(txt);
+                    xhtml.startElement("br");
+                    xhtml.endElement("br");
+                }
+            }
+        }
+    }
 
     private void handleSlideEmbeddedPictures(SlideShow slideshow, 
XHTMLContentHandler xhtml)
             throws TikaException, SAXException, IOException {
@@ -262,60 +262,60 @@ public class HSLFExtractor extends Abstr
             }
 
             handleEmbeddedResource(
-                  TikaInputStream.get(pic.getData()), null, null,
-                  mediaType, xhtml, false);
+                    TikaInputStream.get(pic.getData()), null, null,
+                    mediaType, xhtml, false);
         }
     }
 
     private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler 
xhtml)
-                throws TikaException, SAXException, IOException {
-      Shape[] shapes;
-      try {
-         shapes = slide.getShapes();
-      } catch(NullPointerException e) {
-         // Sometimes HSLF hits problems
-         // Please open POI bugs for any you come across!
-         return;
-      }
-      
-      for( Shape shape : shapes ) {
-         if( shape instanceof OLEShape ) {
-            OLEShape oleShape = (OLEShape)shape;
-            ObjectData data = null;
-            try {
-                data = oleShape.getObjectData();
-            } catch( NullPointerException e ) { 
+            throws TikaException, SAXException, IOException {
+        Shape[] shapes;
+        try {
+            shapes = slide.getShapes();
+        } catch (NullPointerException e) {
+            // Sometimes HSLF hits problems
+            // Please open POI bugs for any you come across!
+            return;
+        }
+
+        for (Shape shape : shapes) {
+            if (shape instanceof OLEShape) {
+                OLEShape oleShape = (OLEShape) shape;
+                ObjectData data = null;
+                try {
+                    data = oleShape.getObjectData();
+                } catch (NullPointerException e) {
                 /* getObjectData throws NPE some times. */
+                }
+
+                if (data != null) {
+                    String objID = Integer.toString(oleShape.getObjectID());
+
+                    // Embedded Object: add a <div
+                    // class="embedded" id="X"/> so consumer can see where
+                    // in the main text each embedded document
+                    // occurred:
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
+                    attributes.addAttribute("", "id", "id", "CDATA", objID);
+                    xhtml.startElement("div", attributes);
+                    xhtml.endElement("div");
+
+                    TikaInputStream stream =
+                            TikaInputStream.get(data.getData());
+                    try {
+                        String mediaType = null;
+                        if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+                            mediaType = "application/vnd.ms-excel";
+                        }
+                        handleEmbeddedResource(
+                                stream, objID, objID,
+                                mediaType, xhtml, false);
+                    } finally {
+                        stream.close();
+                    }
+                }
             }
- 
-            if (data != null) {
-               String objID = Integer.toString(oleShape.getObjectID());
-
-               // Embedded Object: add a <div
-               // class="embedded" id="X"/> so consumer can see where
-               // in the main text each embedded document
-               // occurred:
-               AttributesImpl attributes = new AttributesImpl();
-               attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
-               attributes.addAttribute("", "id", "id", "CDATA", objID);
-               xhtml.startElement("div", attributes);
-               xhtml.endElement("div");
-
-               TikaInputStream stream =
-                    TikaInputStream.get(data.getData());
-               try {
-                  String mediaType = null;
-                  if ("Excel.Chart.8".equals(oleShape.getProgID())) {
-                     mediaType = "application/vnd.ms-excel";
-                  }
-                  handleEmbeddedResource(
-                        stream, objID, objID,
-                        mediaType, xhtml, false);
-               } finally {
-                  stream.close();
-               }
-            }
-         }
-      }
-   }
+        }
+    }
 }


Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
 Fri May 29 14:36:21 2015
@@ -71,7 +71,7 @@ public class ListManager extends Abstrac
             ListData listData = 
listTables.getListData(paragraph.getList().getLsid());
             LevelTuple[] levelTuples = new 
LevelTuple[listData.getLevels().length];
             for (int i = 0; i < listData.getLevels().length; i++) {
-                levelTuples[i] = buildTuple(i,listData.getLevels()[i]);
+                levelTuples[i] = buildTuple(i, listData.getLevels()[i]);
             }
             lc = new ParagraphLevelCounter(levelTuples);
         }
@@ -89,7 +89,7 @@ public class ListManager extends Abstrac
         boolean isLegal = false;
         int start = 1;
         int restart = -1;
-        String lvlText = "%"+i+".";
+        String lvlText = "%" + i + ".";
         String numFmt = "decimal";
 
         start = listLevel.getStartAt();
@@ -127,18 +127,18 @@ public class ListManager extends Abstrac
 
         StringBuilder sb = new StringBuilder();
         int last = 0;
-        for (int i = 0; i < numberOffsets.length;i++) {
-            int offset = (int)numberOffsets[i];
+        for (int i = 0; i < numberOffsets.length; i++) {
+            int offset = (int) numberOffsets[i];
 
-            if (offset == 0){
+            if (offset == 0) {
                 break;
             }
-            sb.append(numberText.substring(last, offset-1));
+            sb.append(numberText.substring(last, offset - 1));
             //need to add one because newer format
             //adds one.  In .doc, this was the array index;
             //but in .docx, this is the level number
-            int lvlNum = (int)numberText.charAt(offset-1)+1;
-            sb.append("%"+lvlNum);
+            int lvlNum = (int) numberText.charAt(offset - 1) + 1;
+            sb.append("%" + lvlNum);
             last = offset;
         }
         if (last < numberText.length()) {
@@ -149,29 +149,29 @@ public class ListManager extends Abstrac
 
     private String convertToNewNumFormat(int numberFormat) {
         switch (numberFormat) {
-            case -1 :
+            case -1:
                 return "none";
-            case 0 :
+            case 0:
                 return "decimal";
-            case 1 :
+            case 1:
                 return "upperRoman";
-            case 2 :
+            case 2:
                 return "lowerRoman";
-            case 3 :
+            case 3:
                 return "upperLetter";
-            case 4 :
+            case 4:
                 return "lowerLetter";
-            case 5 :
+            case 5:
                 return "ordinal";
-            case 22 :
+            case 22:
                 return "decimalZero";
-            case 23 :
+            case 23:
                 return "bullet";
-            case 47 :
+            case 47:
                 return "none";
-            default :
+            default:
                 //do we really want to silently swallow these uncovered cases?
-                throw new RuntimeException("NOT COVERED: "+numberFormat);
+                throw new RuntimeException("NOT COVERED: " + numberFormat);
         }
     }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Fri May 29 14:36:21 2015
@@ -55,7 +55,9 @@ import org.xml.sax.SAXException;
  */
 public class OfficeParser extends AbstractParser {
 
-    /** Serial version UID */
+    /**
+     * Serial version UID
+     */
     private static final long serialVersionUID = 7393462244028653479L;
 
     private static final Set<MediaType> SUPPORTED_TYPES =
@@ -75,64 +77,7 @@ public class OfficeParser extends Abstra
                     POIFSDocumentType.SOLIDWORKS_PART.type,
                     POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
                     POIFSDocumentType.SOLIDWORKS_DRAWING.type
-                    )));
-
-    public enum POIFSDocumentType {
-        WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
-        OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
-        COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
-        WORDDOCUMENT("doc", MediaType.application("msword")),
-        UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
-        ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
-        POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
-        PUBLISHER("pub", MediaType.application("x-mspublisher")),
-        PROJECT("mpp", MediaType.application("vnd.ms-project")),
-        VISIO("vsd", MediaType.application("vnd.visio")),
-        WORKS("wps", MediaType.application("vnd.ms-works")),
-        XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
-        OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
-        SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
-        SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
-        SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
-
-        private final String extension;
-        private final MediaType type;
-
-        POIFSDocumentType(String extension, MediaType type) {
-            this.extension = extension;
-            this.type = type;
-        }
-
-        public String getExtension() {
-            return extension;
-        }
-
-        public MediaType getType() {
-            return type;
-        }
-
-        public static POIFSDocumentType detectType(POIFSFileSystem fs) {
-            return detectType(fs.getRoot());
-        }
-
-        public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
-           return detectType(fs.getRoot());
-       }
-
-        public static POIFSDocumentType detectType(DirectoryEntry node) {
-            Set<String> names = new HashSet<String>();
-            for (Entry entry : node) {
-                names.add(entry.getName());
-            }
-            MediaType type = POIFSContainerDetector.detect(names, node);
-            for (POIFSDocumentType poifsType : values()) {
-               if (type.equals(poifsType.type)) {
-                  return poifsType;
-               }
-            }
-            return UNKNOWN;
-        }
-    }
+            )));
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -183,84 +128,84 @@ public class OfficeParser extends Abstra
         // Parse remaining document entries
         POIFSDocumentType type = POIFSDocumentType.detectType(root);
 
-        if (type!=POIFSDocumentType.UNKNOWN) {
+        if (type != POIFSDocumentType.UNKNOWN) {
             setType(metadata, type.getType());
         }
 
         switch (type) {
-        case SOLIDWORKS_PART:
-        case SOLIDWORKS_ASSEMBLY:
-        case SOLIDWORKS_DRAWING:
-               break;
-        case PUBLISHER:
-           PublisherTextExtractor publisherTextExtractor =
-              new PublisherTextExtractor(root);
-           xhtml.element("p", publisherTextExtractor.getText());
-           break;
-        case WORDDOCUMENT:
-           new WordExtractor(context).parse(root, xhtml);
-           break;
-        case POWERPOINT:
-           new HSLFExtractor(context).parse(root, xhtml);
-           break;
-        case WORKBOOK:
-        case XLR:
-           Locale locale = context.get(Locale.class, Locale.getDefault());
-           new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
-           break;
-        case PROJECT:
-           // We currently can't do anything beyond the metadata
-           break;
-        case VISIO:
-           VisioTextExtractor visioTextExtractor =
-              new VisioTextExtractor(root);
-           for (String text : visioTextExtractor.getAllText()) {
-              xhtml.element("p", text);
-           }
-           break;
-        case OUTLOOK:
-           OutlookExtractor extractor =
-                 new OutlookExtractor(root, context);
-
-           extractor.parse(xhtml, metadata);
-           break;
-        case ENCRYPTED:
-           EncryptionInfo info = new EncryptionInfo(root);
-           Decryptor d = Decryptor.getInstance(info);
-
-           try {
-              // By default, use the default Office Password
-              String password = Decryptor.DEFAULT_PASSWORD;
-              
-              // If they supplied a Password Provider, ask that for the 
password,
-              //  and use the provider given one if available (stick with 
default if not)
-              PasswordProvider passwordProvider = 
context.get(PasswordProvider.class);
-              if (passwordProvider != null) {
-                 String suppliedPassword = 
passwordProvider.getPassword(metadata);
-                 if (suppliedPassword != null) {
-                     password = suppliedPassword;
-                 }
-              }
-              
-              // Check if we've the right password or not
-              if (!d.verifyPassword(password)) {
-                 throw new EncryptedDocumentException();
-              }
-
-              // Decrypt the OLE2 stream, and delegate the resulting OOXML
-              //  file to the regular OOXML parser for normal handling
-              OOXMLParser parser = new OOXMLParser();
-
-              parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
-                    new BodyContentHandler(xhtml)),
-                    metadata, context);
-           } catch (GeneralSecurityException ex) {
-              throw new EncryptedDocumentException(ex);
-           }
-        default:
-            // For unsupported / unhandled types, just the metadata
-            //  is extracted, which happened above
-            break;
+            case SOLIDWORKS_PART:
+            case SOLIDWORKS_ASSEMBLY:
+            case SOLIDWORKS_DRAWING:
+                break;
+            case PUBLISHER:
+                PublisherTextExtractor publisherTextExtractor =
+                        new PublisherTextExtractor(root);
+                xhtml.element("p", publisherTextExtractor.getText());
+                break;
+            case WORDDOCUMENT:
+                new WordExtractor(context).parse(root, xhtml);
+                break;
+            case POWERPOINT:
+                new HSLFExtractor(context).parse(root, xhtml);
+                break;
+            case WORKBOOK:
+            case XLR:
+                Locale locale = context.get(Locale.class, Locale.getDefault());
+                new ExcelExtractor(context, metadata).parse(root, xhtml, 
locale);
+                break;
+            case PROJECT:
+                // We currently can't do anything beyond the metadata
+                break;
+            case VISIO:
+                VisioTextExtractor visioTextExtractor =
+                        new VisioTextExtractor(root);
+                for (String text : visioTextExtractor.getAllText()) {
+                    xhtml.element("p", text);
+                }
+                break;
+            case OUTLOOK:
+                OutlookExtractor extractor =
+                        new OutlookExtractor(root, context);
+
+                extractor.parse(xhtml, metadata);
+                break;
+            case ENCRYPTED:
+                EncryptionInfo info = new EncryptionInfo(root);
+                Decryptor d = Decryptor.getInstance(info);
+
+                try {
+                    // By default, use the default Office Password
+                    String password = Decryptor.DEFAULT_PASSWORD;
+
+                    // If they supplied a Password Provider, ask that for the 
password,
+                    //  and use the provider given one if available (stick 
with default if not)
+                    PasswordProvider passwordProvider = 
context.get(PasswordProvider.class);
+                    if (passwordProvider != null) {
+                        String suppliedPassword = 
passwordProvider.getPassword(metadata);
+                        if (suppliedPassword != null) {
+                            password = suppliedPassword;
+                        }
+                    }
+
+                    // Check if we've the right password or not
+                    if (!d.verifyPassword(password)) {
+                        throw new EncryptedDocumentException();
+                    }
+
+                    // Decrypt the OLE2 stream, and delegate the resulting 
OOXML
+                    //  file to the regular OOXML parser for normal handling
+                    OOXMLParser parser = new OOXMLParser();
+
+                    parser.parse(d.getDataStream(root), new 
EmbeddedContentHandler(
+                                    new BodyContentHandler(xhtml)),
+                            metadata, context);
+                } catch (GeneralSecurityException ex) {
+                    throw new EncryptedDocumentException(ex);
+                }
+            default:
+                // For unsupported / unhandled types, just the metadata
+                //  is extracted, which happened above
+                break;
         }
     }
 
@@ -268,4 +213,61 @@ public class OfficeParser extends Abstra
         metadata.set(Metadata.CONTENT_TYPE, type.toString());
     }
 
+    public enum POIFSDocumentType {
+        WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
+        OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
+        COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
+        WORDDOCUMENT("doc", MediaType.application("msword")),
+        UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+        ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
+        POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
+        PUBLISHER("pub", MediaType.application("x-mspublisher")),
+        PROJECT("mpp", MediaType.application("vnd.ms-project")),
+        VISIO("vsd", MediaType.application("vnd.visio")),
+        WORKS("wps", MediaType.application("vnd.ms-works")),
+        XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
+        OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+        SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+        SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+        SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
+
+        private final String extension;
+        private final MediaType type;
+
+        POIFSDocumentType(String extension, MediaType type) {
+            this.extension = extension;
+            this.type = type;
+        }
+
+        public static POIFSDocumentType detectType(POIFSFileSystem fs) {
+            return detectType(fs.getRoot());
+        }
+
+        public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+            return detectType(fs.getRoot());
+        }
+
+        public static POIFSDocumentType detectType(DirectoryEntry node) {
+            Set<String> names = new HashSet<String>();
+            for (Entry entry : node) {
+                names.add(entry.getName());
+            }
+            MediaType type = POIFSContainerDetector.detect(names, node);
+            for (POIFSDocumentType poifsType : values()) {
+                if (type.equals(poifsType.type)) {
+                    return poifsType;
+                }
+            }
+            return UNKNOWN;
+        }
+
+        public String getExtension() {
+            return extension;
+        }
+
+        public MediaType getType() {
+            return type;
+        }
+    }
+
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Fri May 29 14:36:21 2015
@@ -37,53 +37,28 @@ import org.xml.sax.SAXException;
 
 /**
  * A POI-powered Tika Parser for very old versions of Excel, from
- *  pre-OLE2 days, such as Excel 4.
+ * pre-OLE2 days, such as Excel 4.
  */
 public class OldExcelParser extends AbstractParser {
-   private static final long serialVersionUID = 4611820730372823452L;
-   
-   private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-              MediaType.application("vnd.ms-excel.sheet.4"),
-              MediaType.application("vnd.ms-excel.workspace.4"),
-              MediaType.application("vnd.ms-excel.sheet.3"),
-              MediaType.application("vnd.ms-excel.workspace.3"),
-              MediaType.application("vnd.ms-excel.sheet.2")
-         )));
+    private static final long serialVersionUID = 4611820730372823452L;
 
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-excel.sheet.4"),
+                    MediaType.application("vnd.ms-excel.workspace.4"),
+                    MediaType.application("vnd.ms-excel.sheet.3"),
+                    MediaType.application("vnd.ms-excel.workspace.3"),
+                    MediaType.application("vnd.ms-excel.sheet.2")
+            )));
 
-    /**
-     * Extracts properties and text from an MS Document input stream
-     */
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-       // Open the POI provided extractor
-       OldExcelExtractor extractor = new OldExcelExtractor(stream);
-       
-       // We can't do anything about metadata, as these old formats
-       //  didn't have any stored with them
-       
-       // Set the content type
-       // TODO Get the version and type, to set as the Content Type
-       
-       // Have the text extracted and given to our Content Handler
-       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-       parse(extractor, xhtml);
-    }
-    
-    protected static void parse(OldExcelExtractor extractor, 
-            XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+    protected static void parse(OldExcelExtractor extractor,
+                                XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
         // Get the whole text, as a single string
         String text = extractor.getText();
-        
+
         // Split and output
         xhtml.startDocument();
-        
+
         String line;
         BufferedReader reader = new BufferedReader(new StringReader(text));
         while ((line = reader.readLine()) != null) {
@@ -91,7 +66,32 @@ public class OldExcelParser extends Abst
             xhtml.characters(line);
             xhtml.endElement("p");
         }
-        
+
         xhtml.endDocument();
     }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts properties and text from an MS Document input stream
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Open the POI provided extractor
+        OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+        // We can't do anything about metadata, as these old formats
+        //  didn't have any stored with them
+
+        // Set the content type
+        // TODO Get the version and type, to set as the Content Type
+
+        // Have the text extracted and given to our Content Handler
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        parse(extractor, xhtml);
+    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri May 29 14:36:21 2015
@@ -60,7 +60,7 @@ public class OutlookExtractor extends Ab
 
     public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
         super(context);
-        
+
         try {
             this.msg = new MAPIMessage(root);
         } catch (IOException e) {
@@ -71,185 +71,187 @@ public class OutlookExtractor extends Ab
     public void parse(XHTMLContentHandler xhtml, Metadata metadata)
             throws TikaException, SAXException, IOException {
         try {
-           msg.setReturnNullOnMissingChunk(true);
-           
-           // If the message contains strings that aren't stored
-           //  as Unicode, try to sort out an encoding for them
-           if(msg.has7BitEncodingStrings()) {
-              if(msg.getHeaders() != null) {
-                 // There's normally something in the headers
-                 msg.guess7BitEncoding();
-              } else {
-                 // Nothing in the header, try encoding detection
-                 //  on the message body
-                 StringChunk text = msg.getMainChunks().textBodyChunk; 
-                 if(text != null) {
-                    CharsetDetector detector = new CharsetDetector();
-                    detector.setText( text.getRawValue() );
-                    CharsetMatch match = detector.detect();
-                    if(match.getConfidence() > 35) {
-                       msg.set7BitEncoding( match.getName() );
+            msg.setReturnNullOnMissingChunk(true);
+
+            // If the message contains strings that aren't stored
+            //  as Unicode, try to sort out an encoding for them
+            if (msg.has7BitEncodingStrings()) {
+                if (msg.getHeaders() != null) {
+                    // There's normally something in the headers
+                    msg.guess7BitEncoding();
+                } else {
+                    // Nothing in the header, try encoding detection
+                    //  on the message body
+                    StringChunk text = msg.getMainChunks().textBodyChunk;
+                    if (text != null) {
+                        CharsetDetector detector = new CharsetDetector();
+                        detector.setText(text.getRawValue());
+                        CharsetMatch match = detector.detect();
+                        if (match.getConfidence() > 35) {
+                            msg.set7BitEncoding(match.getName());
+                        }
                     }
-                 }
-              }
-           }
-           
-           // Start with the metadata
-           String subject = msg.getSubject();
-           String from = msg.getDisplayFrom();
-   
-           metadata.set(TikaCoreProperties.CREATOR, from);
-           metadata.set(Metadata.MESSAGE_FROM, from);
-           metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
-           metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
-           metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
-           
-           metadata.set(TikaCoreProperties.TITLE, subject);
-           // TODO: Move to description in Tika 2.0
-           
metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, 
-                   msg.getConversationTopic());
-           
-           try {
-           for(String recipientAddress : msg.getRecipientEmailAddressList()) {
-               if(recipientAddress != null)
-                  metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, 
recipientAddress);
-           }
-           } catch(ChunkNotFoundException he) {} // Will be fixed in POI 3.7 
Final
-           
-           // Date - try two ways to find it
-           // First try via the proper chunk
-           if(msg.getMessageDate() != null) {
-              metadata.set(TikaCoreProperties.CREATED, 
msg.getMessageDate().getTime());
-              metadata.set(TikaCoreProperties.MODIFIED, 
msg.getMessageDate().getTime());
-           } else {
-              try {
-                 // Failing that try via the raw headers 
-                 String[] headers = msg.getHeaders();
-                 if(headers != null && headers.length > 0) {
-                     for(String header: headers) {
-                        
if(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
-                            String date = 
header.substring(header.indexOf(':')+1).trim();
-                            
-                            // See if we can parse it as a normal mail date
-                            try {
-                               Date d = MboxParser.parseDate(date);
-                               metadata.set(TikaCoreProperties.CREATED, d);
-                               metadata.set(TikaCoreProperties.MODIFIED, d);
-                            } catch(ParseException e) {
-                               // Store it as-is, and hope for the best...
-                               metadata.set(TikaCoreProperties.CREATED, date);
-                               metadata.set(TikaCoreProperties.MODIFIED, date);
+                }
+            }
+
+            // Start with the metadata
+            String subject = msg.getSubject();
+            String from = msg.getDisplayFrom();
+
+            metadata.set(TikaCoreProperties.CREATOR, from);
+            metadata.set(Metadata.MESSAGE_FROM, from);
+            metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+            metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+            metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+            metadata.set(TikaCoreProperties.TITLE, subject);
+            // TODO: Move to description in Tika 2.0
+            metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+                    msg.getConversationTopic());
+
+            try {
+                for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+                    if (recipientAddress != null)
+                        metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+                }
+            } catch (ChunkNotFoundException he) {
+            } // Will be fixed in POI 3.7 Final
+
+            // Date - try two ways to find it
+            // First try via the proper chunk
+            if (msg.getMessageDate() != null) {
+                metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+                metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+            } else {
+                try {
+                    // Failing that try via the raw headers
+                    String[] headers = msg.getHeaders();
+                    if (headers != null && headers.length > 0) {
+                        for (String header : headers) {
+                            if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+                                String date = header.substring(header.indexOf(':') + 1).trim();
+
+                                // See if we can parse it as a normal mail date
+                                try {
+                                    Date d = MboxParser.parseDate(date);
+                                    metadata.set(TikaCoreProperties.CREATED, d);
+                                    metadata.set(TikaCoreProperties.MODIFIED, d);
+                                } catch (ParseException e) {
+                                    // Store it as-is, and hope for the best...
+                                    metadata.set(TikaCoreProperties.CREATED, date);
+                                    metadata.set(TikaCoreProperties.MODIFIED, date);
+                                }
+                                break;
                             }
-                            break;
                         }
-                     }
-                 }
-              } catch(ChunkNotFoundException he) {
-                 // We can't find the date, sorry...
-              }
-           }
-           
-   
-           xhtml.element("h1", subject);
-   
-           // Output the from and to details in text, as you
-           //  often want them in text form for searching
-           xhtml.startElement("dl");
-           if (from!=null) {
-               header(xhtml, "From", from);
-           }
-           header(xhtml, "To", msg.getDisplayTo());
-           header(xhtml, "Cc", msg.getDisplayCC());
-           header(xhtml, "Bcc", msg.getDisplayBCC());
-           try {
-               header(xhtml, "Recipients", msg.getRecipientEmailAddress());
-           } catch(ChunkNotFoundException e) {}
-           xhtml.endElement("dl");
-   
-           // Get the message body. Preference order is: html, rtf, text
-           Chunk htmlChunk = null;
-           Chunk rtfChunk = null;
-           Chunk textChunk = null;
-           for(Chunk chunk : msg.getMainChunks().getChunks()) {
-              if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
-                 htmlChunk = chunk;
-              }
-              if(chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
-                 rtfChunk = chunk;
-              }
-              if(chunk.getChunkId() == MAPIProperty.BODY.id) {
-                 textChunk = chunk;
-              }
-           }
-           
-           boolean doneBody = false;
-           xhtml.startElement("div", "class", "message-body");
-           if(htmlChunk != null) {
-              byte[] data = null;
-              if(htmlChunk instanceof ByteChunk) {
-                 data = ((ByteChunk)htmlChunk).getValue();
-              } else if(htmlChunk instanceof StringChunk) {
-                 data = ((StringChunk)htmlChunk).getRawValue();
-              }
-              if(data != null) {
-                 HtmlParser htmlParser = new HtmlParser();
-                 htmlParser.parse(
-                       new ByteArrayInputStream(data),
-                       new EmbeddedContentHandler(new 
BodyContentHandler(xhtml)), 
-                       new Metadata(), new ParseContext()
-                 );
-                 doneBody = true;
-              }
-           }
-           if(rtfChunk != null && !doneBody) {
-              ByteChunk chunk = (ByteChunk)rtfChunk;
-              MAPIRtfAttribute rtf = new MAPIRtfAttribute(
-                    MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), 
chunk.getValue()
-              );
-              RTFParser rtfParser = new RTFParser();
-              rtfParser.parse(
-                              new ByteArrayInputStream(rtf.getData()),
-                              new EmbeddedContentHandler(new 
BodyContentHandler(xhtml)),
-                              new Metadata(), new ParseContext());
-              doneBody = true;
-           }
-           if(textChunk != null && !doneBody) {
-              xhtml.element("p", ((StringChunk)textChunk).getValue());
-           }
-           xhtml.endElement("div");
-           
-           // Process the attachments
-           for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
-               xhtml.startElement("div", "class", "attachment-entry");
-               
-               String filename = null;
-               if (attachment.attachLongFileName != null) {
-                  filename = attachment.attachLongFileName.getValue();
-               } else if (attachment.attachFileName != null) {
-                  filename = attachment.attachFileName.getValue();
-               }
-               if (filename != null && filename.length() > 0) {
-                   xhtml.element("h1", filename);
-               }
-               
-               if(attachment.attachData != null) {
-                  handleEmbeddedResource(
-                        TikaInputStream.get(attachment.attachData.getValue()),
-                        filename, null,
-                        null, xhtml, true
-                  );
-               }
-               if(attachment.attachmentDirectory != null) {
-                  handleEmbeddedOfficeDoc(
-                        attachment.attachmentDirectory.getDirectory(),
-                        xhtml
-                  );
-               }
-
-               xhtml.endElement("div");
-           }
-        } catch(ChunkNotFoundException e) {
-           throw new TikaException("POI MAPIMessage broken - didn't return 
null on missing chunk", e);
+                    }
+                } catch (ChunkNotFoundException he) {
+                    // We can't find the date, sorry...
+                }
+            }
+
+
+            xhtml.element("h1", subject);
+
+            // Output the from and to details in text, as you
+            //  often want them in text form for searching
+            xhtml.startElement("dl");
+            if (from != null) {
+                header(xhtml, "From", from);
+            }
+            header(xhtml, "To", msg.getDisplayTo());
+            header(xhtml, "Cc", msg.getDisplayCC());
+            header(xhtml, "Bcc", msg.getDisplayBCC());
+            try {
+                header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+            } catch (ChunkNotFoundException e) {
+            }
+            xhtml.endElement("dl");
+
+            // Get the message body. Preference order is: html, rtf, text
+            Chunk htmlChunk = null;
+            Chunk rtfChunk = null;
+            Chunk textChunk = null;
+            for (Chunk chunk : msg.getMainChunks().getChunks()) {
+                if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+                    htmlChunk = chunk;
+                }
+                if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+                    rtfChunk = chunk;
+                }
+                if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+                    textChunk = chunk;
+                }
+            }
+
+            boolean doneBody = false;
+            xhtml.startElement("div", "class", "message-body");
+            if (htmlChunk != null) {
+                byte[] data = null;
+                if (htmlChunk instanceof ByteChunk) {
+                    data = ((ByteChunk) htmlChunk).getValue();
+                } else if (htmlChunk instanceof StringChunk) {
+                    data = ((StringChunk) htmlChunk).getRawValue();
+                }
+                if (data != null) {
+                    HtmlParser htmlParser = new HtmlParser();
+                    htmlParser.parse(
+                            new ByteArrayInputStream(data),
+                            new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                            new Metadata(), new ParseContext()
+                    );
+                    doneBody = true;
+                }
+            }
+            if (rtfChunk != null && !doneBody) {
+                ByteChunk chunk = (ByteChunk) rtfChunk;
+                MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+                        MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+                );
+                RTFParser rtfParser = new RTFParser();
+                rtfParser.parse(
+                        new ByteArrayInputStream(rtf.getData()),
+                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                        new Metadata(), new ParseContext());
+                doneBody = true;
+            }
+            if (textChunk != null && !doneBody) {
+                xhtml.element("p", ((StringChunk) textChunk).getValue());
+            }
+            xhtml.endElement("div");
+
+            // Process the attachments
+            for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+                xhtml.startElement("div", "class", "attachment-entry");
+
+                String filename = null;
+                if (attachment.attachLongFileName != null) {
+                    filename = attachment.attachLongFileName.getValue();
+                } else if (attachment.attachFileName != null) {
+                    filename = attachment.attachFileName.getValue();
+                }
+                if (filename != null && filename.length() > 0) {
+                    xhtml.element("h1", filename);
+                }
+
+                if (attachment.attachData != null) {
+                    handleEmbeddedResource(
+                            TikaInputStream.get(attachment.attachData.getValue()),
+                            filename, null,
+                            null, xhtml, true
+                    );
+                }
+                if (attachment.attachmentDirectory != null) {
+                    handleEmbeddedOfficeDoc(
+                            attachment.attachmentDirectory.getDirectory(),
+                            xhtml
+                    );
+                }
+
+                xhtml.endElement("div");
+            }
+        } catch (ChunkNotFoundException e) {
+            throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
         }
     }
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri May 29 14:36:21 2015
@@ -40,162 +40,135 @@ import org.apache.tika.mime.MediaType;
 
 /**
  * A detector that works on a POIFS OLE2 document
- *  to figure out exactly what the file is.
+ * to figure out exactly what the file is.
  * This should work for all OLE2 documents, whether
- *  they are ones supported by POI or not.
+ * they are ones supported by POI or not.
  */
 public class POIFSContainerDetector implements Detector {
 
-    /** Serial version UID */
-    private static final long serialVersionUID = -3028021741663605293L;
-    
-    /** An ASCII String "StarImpress" */
-    private static final byte [] STAR_IMPRESS = new byte [] {
-        0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
-    };
-    
-    /** An ASCII String "StarDraw" */
-    private static final byte [] STAR_DRAW = new byte [] {
-        0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
-    };
-    
-    /** An ASCII String "Quill96" for Works Files */
-    private static final byte [] WORKS_QUILL96 = new byte[] {
-        0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
-    };
-
-    /** The OLE base file format */
+    /**
+     * The OLE base file format
+     */
     public static final MediaType OLE = application("x-tika-msoffice");
-    
-    /** The protected OOXML base file format */
+    /**
+     * The protected OOXML base file format
+     */
     public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
-    
-    /** General embedded document type within an OLE2 container */
+    /**
+     * General embedded document type within an OLE2 container
+     */
     public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
-    
-    /** An OLE10 Native embedded document within another OLE2 document */
+    /**
+     * An OLE10 Native embedded document within another OLE2 document
+     */
     public static final MediaType OLE10_NATIVE =
             new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
-    
-    /** Some other kind of embedded document, in a CompObj container within 
another OLE2 document */
+    /**
+     * Some other kind of embedded document, in a CompObj container within another OLE2 document
+     */
     public static final MediaType COMP_OBJ =
             new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
-
-    /** Microsoft Excel */
+    /**
+     * Microsoft Excel
+     */
     public static final MediaType XLS = application("vnd.ms-excel");
-
-    /** Microsoft Word */
+    /**
+     * Microsoft Word
+     */
     public static final MediaType DOC = application("msword");
-
-    /** Microsoft PowerPoint */
+    /**
+     * Microsoft PowerPoint
+     */
     public static final MediaType PPT = application("vnd.ms-powerpoint");
-
-    /** Microsoft Publisher */
+    /**
+     * Microsoft Publisher
+     */
     public static final MediaType PUB = application("x-mspublisher");
-
-    /** Microsoft Visio */
+    /**
+     * Microsoft Visio
+     */
     public static final MediaType VSD = application("vnd.visio");
-
-    /** Microsoft Works */
+    /**
+     * Microsoft Works
+     */
     public static final MediaType WPS = application("vnd.ms-works");
-    
-    /** Microsoft Works Spreadsheet 7.0 */
+    /**
+     * Microsoft Works Spreadsheet 7.0
+     */
     public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
-
-    /** Microsoft Outlook */
+    /**
+     * Microsoft Outlook
+     */
     public static final MediaType MSG = application("vnd.ms-outlook");
-    
-    /** Microsoft Project */
+    /**
+     * Microsoft Project
+     */
     public static final MediaType MPP = application("vnd.ms-project");
-    
-    /** StarOffice Calc */
+    /**
+     * StarOffice Calc
+     */
     public static final MediaType SDC = application("vnd.stardivision.calc");
-    
-    /** StarOffice Draw */
+    /**
+     * StarOffice Draw
+     */
     public static final MediaType SDA = application("vnd.stardivision.draw");
-    
-    /** StarOffice Impress */
+    /**
+     * StarOffice Impress
+     */
     public static final MediaType SDD = application("vnd.stardivision.impress");
-    
-    /** StarOffice Writer */
+    /**
+     * StarOffice Writer
+     */
     public static final MediaType SDW = application("vnd.stardivision.writer");
-
-    /** SolidWorks CAD file */
+    /**
+     * SolidWorks CAD file
+     */
     public static final MediaType SLDWORKS = application("sldworks");
-
-    /** Regexp for matching the MPP Project Data stream */
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -3028021741663605293L;
+    /**
+     * An ASCII String "StarImpress"
+     */
+    private static final byte[] STAR_IMPRESS = new byte[]{
+            0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
+    };
+    /**
+     * An ASCII String "StarDraw"
+     */
+    private static final byte[] STAR_DRAW = new byte[]{
+            0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
+    };
+    /**
+     * An ASCII String "Quill96" for Works Files
+     */
+    private static final byte[] WORKS_QUILL96 = new byte[]{
+            0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
+    };
+    /**
+     * Regexp for matching the MPP Project Data stream
+     */
     private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
 
-    public MediaType detect(InputStream input, Metadata metadata)
-             throws IOException {
-        // Check if we have access to the document
-        if (input == null) {
-            return MediaType.OCTET_STREAM;
-        }
-
-        // If this is a TikaInputStream wrapping an already
-        // parsed NPOIFileSystem/DirectoryNode, just get the
-        // names from the root:
-        TikaInputStream tis = TikaInputStream.cast(input);
-        Set<String> names = null;
-        if (tis != null) {
-            Object container = tis.getOpenContainer();
-            if (container instanceof NPOIFSFileSystem) {
-                names = getTopLevelNames(((NPOIFSFileSystem) 
container).getRoot());
-            } else if (container instanceof DirectoryNode) {
-                names = getTopLevelNames((DirectoryNode) container);
-            }
-        }
-
-        if (names == null) {
-            // Check if the document starts with the OLE header
-            input.mark(8);
-            try {
-                if (input.read() != 0xd0 || input.read() != 0xcf
-                    || input.read() != 0x11 || input.read() != 0xe0
-                    || input.read() != 0xa1 || input.read() != 0xb1
-                    || input.read() != 0x1a || input.read() != 0xe1) {
-                    return MediaType.OCTET_STREAM;
-                }
-            } finally {
-                input.reset();
-            }
-        }
-
-        // We can only detect the exact type when given a TikaInputStream
-        if (names == null && tis != null) {
-            // Look for known top level entry names to detect the document type
-            names = getTopLevelNames(tis);
-        }
-        
-        // Detect based on the names (as available)
-        if (tis != null && 
-            tis.getOpenContainer() != null && 
-            tis.getOpenContainer() instanceof NPOIFSFileSystem) {
-            return detect(names, 
((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
-        } else {
-            return detect(names, null);
-        }
-    }
-
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
      * names of the top level streams within the file.
-     * 
+     *
      * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
-     *             entry of the filesystem whose type is to be detected, as a
-     *             second argument.
+     * entry of the filesystem whose type is to be detected, as a
+     * second argument.
      */
     protected static MediaType detect(Set<String> names) {
         return detect(names, null);
     }
-    
+
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
      * names of the top-level streams within the file. In some cases the
      * detection may need access to the root {@link DirectoryEntry} of that file
      * for best results. The entry can be given as a second, optional argument.
-     * 
+     *
      * @param names
      * @param root
      * @return
@@ -227,20 +200,20 @@ public class POIFSContainerDetector impl
                 // This check has to be before names.contains("Workbook")
                 // Works 7.0 spreadsheet files contain both
                 // we want to avoid classifying this as Excel
-                return XLR; 
+                return XLR;
             } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
                 return XLS;
             } else if (names.contains("Book")) {
-               // Excel 95 or older, we won't be able to parse this....
-               return XLS;
-            } else if (names.contains("EncryptedPackage") && 
+                // Excel 95 or older, we won't be able to parse this....
+                return XLS;
+            } else if (names.contains("EncryptedPackage") &&
                     names.contains("EncryptionInfo") &&
                     names.contains("\u0006DataSpaces")) {
                 // This is a protected OOXML document, which is an OLE2 file
                 //  with an Encrypted Stream which holds the OOXML data
                 // Without decrypting the stream, we can't tell what kind of
                 //  OOXML file we have. Return a general OOXML Protected type,
-                //  and hope the name based detection can guess the rest! 
+                //  and hope the name based detection can guess the rest!
                 return OOXML_PROTECTED;
             } else if (names.contains("EncryptedPackage")) {
                 return OLE;
@@ -263,33 +236,33 @@ public class POIFSContainerDetector impl
             } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
                 return COMP_OBJ;
             } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
-               // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
-               // If we have the Directory, check
-               if (root != null) {
-                  MediaType type = processCompObjFormatType(root);
-                  if (type == WPS) {
-                     return WPS;
-                  } else {
-                     // Assume it's a general CompObj embedded resource
-                     return COMP_OBJ;
-                  }
-               } else {
-                  // Assume it's a general CompObj embedded resource
-                  return COMP_OBJ;
-               }
+                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
+                // If we have the Directory, check
+                if (root != null) {
+                    MediaType type = processCompObjFormatType(root);
+                    if (type == WPS) {
+                        return WPS;
+                    } else {
+                        // Assume it's a general CompObj embedded resource
+                        return COMP_OBJ;
+                    }
+                } else {
+                    // Assume it's a general CompObj embedded resource
+                    return COMP_OBJ;
+                }
             } else if (names.contains("CONTENTS")) {
-               // CONTENTS without SPELLING nor CompObj normally means some sort
-               //  of embedded non-office file inside an OLE2 document
-               // This is most commonly triggered on nested directories
-               return OLE;
+                // CONTENTS without SPELLING nor CompObj normally means some sort
+                //  of embedded non-office file inside an OLE2 document
+                // This is most commonly triggered on nested directories
+                return OLE;
             } else if (names.contains("\u0001CompObj") &&
-                  (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
-               // Could be Project, look for common name patterns
-               for (String name : names) {
-                  if (mppDataMatch.matcher(name).matches()) {
-                     return MPP;
-                  }
-               }
+                    (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
+                // Could be Project, look for common name patterns
+                for (String name : names) {
+                    if (mppDataMatch.matcher(name).matches()) {
+                        return MPP;
+                    }
+                }
             } else if (names.contains("PerfectOffice_MAIN")) {
                 if (names.contains("SlideShow")) {
                     return MediaType.application("x-corelpresentations"); // .shw
@@ -313,36 +286,36 @@ public class POIFSContainerDetector impl
 
     /**
      * Is this one of the kinds of formats which uses CompObj to
-     *  store all of their data, eg Star Draw, Star Impress or
-     *  (older) Works?
+     * store all of their data, eg Star Draw, Star Impress or
+     * (older) Works?
      * If not, it's likely an embedded resource
      */
     private static MediaType processCompObjFormatType(DirectoryEntry root) {
         try {
             Entry e = root.getEntry("\u0001CompObj");
             if (e != null && e.isDocumentEntry()) {
-                DocumentNode dn = (DocumentNode)e;
+                DocumentNode dn = (DocumentNode) e;
                 DocumentInputStream stream = new DocumentInputStream(dn);
-                byte [] bytes = IOUtils.toByteArray(stream);
+                byte[] bytes = IOUtils.toByteArray(stream);
                 /*
                  * This array contains a string with a normal ASCII name of the
                  * application used to create this file. We want to search for that
                  * name.
                  */
-                if ( arrayContains(bytes, STAR_DRAW) ) {
+                if (arrayContains(bytes, STAR_DRAW)) {
                     return SDA;
                 } else if (arrayContains(bytes, STAR_IMPRESS)) {
                     return SDD;
                 } else if (arrayContains(bytes, WORKS_QUILL96)) {
-                   return WPS;
+                    return WPS;
                 }
-            } 
+            }
         } catch (Exception e) {
             /*
              * "root.getEntry" can throw FileNotFoundException. The code inside
              * "if" can throw IOExceptions. Theoretically. Practically no
              * exceptions will likely ever appear.
-             * 
+             *
              * Swallow all of them. If any occur, we just assume that we can't
              * distinguish between Draw and Impress and return something safe:
              * x-tika-msoffice
@@ -350,10 +323,10 @@ public class POIFSContainerDetector impl
         }
         return OLE;
     }
-    
+
     // poor man's search for byte arrays, replace with some library call if
     // you know one without adding new dependencies
-    private static boolean arrayContains(byte [] larger, byte [] smaller) {
+    private static boolean arrayContains(byte[] larger, byte[] smaller) {
         int largerCounter = 0;
         int smallerCounter = 0;
         while (largerCounter < larger.length) {
@@ -365,7 +338,7 @@ public class POIFSContainerDetector impl
                 }
             } else {
                 largerCounter = largerCounter - smallerCounter + 1;
-                smallerCounter=0;
+                smallerCounter = 0;
             }
         }
         return false;
@@ -401,4 +374,56 @@ public class POIFSContainerDetector impl
         }
         return names;
     }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Check if we have access to the document
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        // If this is a TikaInputStream wrapping an already
+        // parsed NPOIFileSystem/DirectoryNode, just get the
+        // names from the root:
+        TikaInputStream tis = TikaInputStream.cast(input);
+        Set<String> names = null;
+        if (tis != null) {
+            Object container = tis.getOpenContainer();
+            if (container instanceof NPOIFSFileSystem) {
+                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+            } else if (container instanceof DirectoryNode) {
+                names = getTopLevelNames((DirectoryNode) container);
+            }
+        }
+
+        if (names == null) {
+            // Check if the document starts with the OLE header
+            input.mark(8);
+            try {
+                if (input.read() != 0xd0 || input.read() != 0xcf
+                        || input.read() != 0x11 || input.read() != 0xe0
+                        || input.read() != 0xa1 || input.read() != 0xb1
+                        || input.read() != 0x1a || input.read() != 0xe1) {
+                    return MediaType.OCTET_STREAM;
+                }
+            } finally {
+                input.reset();
+            }
+        }
+
+        // We can only detect the exact type when given a TikaInputStream
+        if (names == null && tis != null) {
+            // Look for known top level entry names to detect the document type
+            names = getTopLevelNames(tis);
+        }
+
+        // Detect based on the names (as available)
+        if (tis != null &&
+                tis.getOpenContainer() != null &&
+                tis.getOpenContainer() instanceof NPOIFSFileSystem) {
+            return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+        } else {
+            return detect(names, null);
+        }
+    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Fri May 29 14:36:21 2015
@@ -50,10 +50,10 @@ public class SummaryExtractor {
     private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
 
     private static final String SUMMARY_INFORMATION =
-        SummaryInformation.DEFAULT_STREAM_NAME;
+            SummaryInformation.DEFAULT_STREAM_NAME;
 
     private static final String DOCUMENT_SUMMARY_INFORMATION =
-        DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+            DocumentSummaryInformation.DEFAULT_STREAM_NAME;
 
     private final Metadata metadata;
 
@@ -77,9 +77,9 @@ public class SummaryExtractor {
             throws IOException, TikaException {
         try {
             DocumentEntry entry =
-                (DocumentEntry) root.getEntry(entryName);
+                    (DocumentEntry) root.getEntry(entryName);
             PropertySet properties =
-                new PropertySet(new DocumentInputStream(entry));
+                    new PropertySet(new DocumentInputStream(entry));
             if (properties.isSummaryInformation()) {
                 parse(new SummaryInformation(properties));
             }
@@ -115,7 +115,7 @@ public class SummaryExtractor {
         set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
         set(Metadata.EDIT_TIME, summary.getEditTime());
         set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
-        
+
         // New style counts
         set(Office.WORD_COUNT, summary.getWordCount());
         set(Office.CHARACTER_COUNT, summary.getCharCount());
@@ -123,7 +123,7 @@ public class SummaryExtractor {
         if (summary.getPageCount() > 0) {
             metadata.set(PagedText.N_PAGES, summary.getPageCount());
         }
-        
+
         // Old style, Tika 1.0 properties
         // TODO Remove these in Tika 2.0
         set(Metadata.TEMPLATE, summary.getTemplate());
@@ -140,7 +140,7 @@ public class SummaryExtractor {
         set(OfficeOpenXMLExtended.MANAGER, summary.getManager());
         set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
         set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
-        
+
         // New style counts
         set(Office.SLIDE_COUNT, summary.getSlideCount());
         if (summary.getSlideCount() > 0) {
@@ -152,7 +152,7 @@ public class SummaryExtractor {
         set(Metadata.MANAGER, summary.getManager());
         set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
         set(Metadata.CATEGORY, summary.getCategory());
-        
+
         parse(summary.getCustomProperties());
     }
 
@@ -169,6 +169,7 @@ public class SummaryExtractor {
 
     /**
      * Attempt to parse custom document properties and add to the collection of metadata
+     *
      * @param customProperties
      */
     private void parse(CustomProperties customProperties) {
@@ -179,23 +180,23 @@ public class SummaryExtractor {
 
                 // Get, convert and save property value
                 Object value = customProperties.get(name);
-                if (value instanceof String){
-                    set(key, (String)value);
+                if (value instanceof String) {
+                    set(key, (String) value);
                 } else if (value instanceof Date) {
                     Property prop = Property.externalDate(key);
-                    metadata.set(prop, (Date)value);
+                    metadata.set(prop, (Date) value);
                 } else if (value instanceof Boolean) {
                     Property prop = Property.externalBoolean(key);
                     metadata.set(prop, value.toString());
                 } else if (value instanceof Long) {
                     Property prop = Property.externalInteger(key);
-                    metadata.set(prop, ((Long)value).intValue());
+                    metadata.set(prop, ((Long) value).intValue());
                 } else if (value instanceof Double) {
                     Property prop = Property.externalReal(key);
-                    metadata.set(prop, (Double)value);
+                    metadata.set(prop, (Double) value);
                 } else if (value instanceof Integer) {
                     Property prop = Property.externalInteger(key);
-                    metadata.set(prop, ((Integer)value).intValue());
+                    metadata.set(prop, ((Integer) value).intValue());
                 }
             }
         }
@@ -206,7 +207,7 @@ public class SummaryExtractor {
             metadata.set(name, value);
         }
     }
-    
+
     private void set(Property property, String value) {
         if (value != null) {
             metadata.set(property, value);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Fri May 29 14:36:21 2015
@@ -43,17 +43,17 @@ import org.xml.sax.SAXException;
 
 /**
  * A POI-powered Tika Parser for TNEF (Transport Neutral
- *  Encoding Format) messages, aka winmail.dat
+ * Encoding Format) messages, aka winmail.dat
  */
 public class TNEFParser extends AbstractParser {
-   private static final long serialVersionUID = 4611820730372823452L;
-   
-   private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-              MediaType.application("vnd.ms-tnef"),
-              MediaType.application("ms-tnef"),
-              MediaType.application("x-tnef")
-         )));
+    private static final long serialVersionUID = 4611820730372823452L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-tnef"),
+                    MediaType.application("ms-tnef"),
+                    MediaType.application("x-tnef")
+            )));
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -66,70 +66,70 @@ public class TNEFParser extends Abstract
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-       
-       // We work by recursing, so get the appropriate bits 
-       EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-       EmbeddedDocumentExtractor embeddedExtractor;
-       if (ex==null) {
-           embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
-       } else {
-           embeddedExtractor = ex;
-       }
-       
-       // Ask POI to process the file for us
-       HMEFMessage msg = new HMEFMessage(stream);
-       
-       // Set the message subject if known
-       String subject = msg.getSubject();
-       if(subject != null && subject.length() > 0) {
-          // TODO: Move to title in Tika 2.0
-          metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
-       }
-       
-       // Recurse into the message body RTF
-       MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
-       if(attr != null && attr instanceof MAPIRtfAttribute) {
-          MAPIRtfAttribute rtf = (MAPIRtfAttribute)attr;
-          handleEmbedded(
-                "message.rtf", "application/rtf",
-                rtf.getData(),
-                embeddedExtractor, handler
-          );
-       }
-       
-       // Recurse into each attachment in turn
-       for(Attachment attachment : msg.getAttachments()) {
-          String name = attachment.getLongFilename();
-          if(name == null || name.length() == 0) {
-             name = attachment.getFilename();
-          }
-          if(name == null || name.length() == 0) {
-             String ext = attachment.getExtension();
-             if(ext != null) {
-                name = "unknown" + ext;
-             }
-          }
-          handleEmbedded(
-                name, null, attachment.getContents(),
-                embeddedExtractor, handler
-          );
-       }
+
+        // We work by recursing, so get the appropriate bits
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        EmbeddedDocumentExtractor embeddedExtractor;
+        if (ex == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            embeddedExtractor = ex;
+        }
+
+        // Ask POI to process the file for us
+        HMEFMessage msg = new HMEFMessage(stream);
+
+        // Set the message subject if known
+        String subject = msg.getSubject();
+        if (subject != null && subject.length() > 0) {
+            // TODO: Move to title in Tika 2.0
+            metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
+        }
+
+        // Recurse into the message body RTF
+        MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
+        if (attr != null && attr instanceof MAPIRtfAttribute) {
+            MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr;
+            handleEmbedded(
+                    "message.rtf", "application/rtf",
+                    rtf.getData(),
+                    embeddedExtractor, handler
+            );
+        }
+
+        // Recurse into each attachment in turn
+        for (Attachment attachment : msg.getAttachments()) {
+            String name = attachment.getLongFilename();
+            if (name == null || name.length() == 0) {
+                name = attachment.getFilename();
+            }
+            if (name == null || name.length() == 0) {
+                String ext = attachment.getExtension();
+                if (ext != null) {
+                    name = "unknown" + ext;
+                }
+            }
+            handleEmbedded(
+                    name, null, attachment.getContents(),
+                    embeddedExtractor, handler
+            );
+        }
     }
-    
+
     private void handleEmbedded(String name, String type, byte[] contents,
-          EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
-          throws IOException, SAXException, TikaException {
-       Metadata metadata = new Metadata();
-       if(name != null)
-          metadata.set(Metadata.RESOURCE_NAME_KEY, name);
-       if(type != null)
-          metadata.set(Metadata.CONTENT_TYPE, type);
-
-       if (embeddedExtractor.shouldParseEmbedded(metadata)) {
-         embeddedExtractor.parseEmbedded(
-                 TikaInputStream.get(contents),
-                 new EmbeddedContentHandler(handler),
-                 metadata, false);
-       }
+                                EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        Metadata metadata = new Metadata();
+        if (name != null)
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        if (type != null)
+            metadata.set(Metadata.CONTENT_TYPE, type);
+
+        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+            embeddedExtractor.parseEmbedded(
+                    TikaInputStream.get(contents),
+                    new EmbeddedContentHandler(handler),
+                    metadata, false);
+        }
     }
 }

svn commit: r1682489 [3/14] - in /tika/trunk: tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/ tika-parsers/src/main/java/org/a...

Reply via email to