This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b2a462c6d2e48d525543c1132671100fc7ff0e6f
Author: tballison <[email protected]>
AuthorDate: Mon Feb 27 13:34:43 2017 -0500

    TIKA 2276 -- cleanup
---
 .../tika/extractor/EmbeddedDocumentUtil.java       | 67 +++++++++++++++++-----
 .../java/org/apache/tika/parser/chm/ChmParser.java |  2 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |  9 +--
 .../tika/parser/microsoft/OutlookExtractor.java    | 15 +++--
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index a23c6da..3c3531e 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -35,6 +35,7 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.ContentHandler;
@@ -92,33 +93,73 @@ public class EmbeddedDocumentUtil implements Serializable {
 
     /**
      * Tries to find an existing parser within the ParseContext.
-     * Initially tries to find first child parser specific to that mediaType.
-     * Then backs off to the overall parser.
+     * It looks inside of CompositeParsers and ParserDecorators.
+     * The use case is when a parser needs to parse an internal stream
+     * that is _part_ of the document, e.g. rtf body inside an msg.
+     * <p/>
      * Can return <code>null</code> if the context contains no parser or
-     * if an appropriate parser can't be found.
+     * the correct parser can't be found.
      *
-     * @param mediaType
+     * @param clazz parser class to search for
      * @param context
      * @return
      */
-    public static Parser tryToFindExistingParser(MediaType mediaType, ParseContext context) {
+    public static Parser tryToFindExistingLeafParser(String clazz, ParseContext context) {
         Parser p = context.get(Parser.class);
+        if (equals(p, clazz)) {
+            return p;
+        }
+        Parser returnParser = null;
         if (p != null) {
-            //try to find the sub parser
+            if (p instanceof ParserDecorator) {
+                p = ((ParserDecorator)p).getWrappedParser();
+            }
+            if (equals(p, clazz)) {
+                return p;
+            }
             if (p instanceof CompositeParser) {
-                Map<MediaType, Parser> map = ((CompositeParser) p).getParsers(context);
-                Parser retParser = map.get(mediaType);
-                if (retParser != null) {
-                    return retParser;
-                }
+                returnParser = findInComposite((CompositeParser) p, clazz, context);
             }
         }
-        if (p != null && p.getSupportedTypes(context).contains(mediaType)) {
-            return p;
+        if (returnParser != null && equals(returnParser, clazz)) {
+            return returnParser;
+        }
+
+        return null;
+    }
+
+    private static Parser findInComposite(CompositeParser p, String clazz, ParseContext context) {
+        Map<MediaType, Parser> map = p.getParsers(context);
+        for (Map.Entry<MediaType, Parser> e : map.entrySet()) {
+            Parser candidate = e.getValue();
+            if (equals(candidate, clazz)) {
+                return candidate;
+            }
+            if (candidate instanceof ParserDecorator) {
+                candidate = ((ParserDecorator)candidate).getWrappedParser();
+            }
+            if (equals(candidate, clazz)) {
+                return candidate;
+            }
+            if (candidate instanceof CompositeParser) {
+                candidate = findInComposite((CompositeParser) candidate, clazz, context);
+            }
+            if (equals(candidate, clazz)) {
+                return candidate;
+            }
         }
         return null;
     }
 
+    private static boolean equals(Parser parser, String clazz) {
+        if (parser == null) {
+            return false;
+        }
+        return parser.getClass().getCanonicalName().equals(clazz);
+    }
+
+
+
     public PasswordProvider getPasswordProvider() {
         return context.get(PasswordProvider.class);
     }
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
index e9b0fbf..c9465ff 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
@@ -60,7 +60,7 @@ public class ChmParser extends AbstractParser {
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
         ChmExtractor chmExtractor = new ChmExtractor(stream);
-        Parser htmlProxy = EmbeddedDocumentUtil.tryToFindExistingParser(MediaType.TEXT_HTML, context);
+        Parser htmlProxy = EmbeddedDocumentUtil.tryToFindExistingLeafParser("org.apache.tika.parser.html.HtmlParser", context);
         if (htmlProxy == null) {
             htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
         }
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 64d8fdd..19bee8b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -83,12 +83,13 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
         currencyFormatter = NumberFormat.getCurrencyInstance(locale);
         shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
         this.parseContext = context;
-        Parser tmpHtml = EmbeddedDocumentUtil.tryToFindExistingParser(MediaType.TEXT_HTML, context);
-        if (tmpHtml == null) {
-            tmpHtml = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
+        Parser tmpHtmlParser =
+                EmbeddedDocumentUtil.tryToFindExistingLeafParser("org.apache.tika.parser.html.HtmlParser", context);
 
+        if (tmpHtmlParser == null) {
+            tmpHtmlParser = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
         }
-        this.htmlParserProxy = tmpHtml;
+        this.htmlParserProxy = tmpHtmlParser;
     }
 
     public void parse(Database db, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 8843d69..614bb5b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -91,7 +91,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
     private static final Metadata EMPTY_METADATA = new Metadata();
     private final SimpleDateFormat dateFormat;
     private final EncodingDetector htmlEncodingDetectorProxy;
-    private final Parser htmlParserProxy;
 
     private final MAPIMessage msg;
     private final ParseContext parseContext;
@@ -104,7 +103,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         super(context);
         this.parseContext = context;
         this.htmlEncodingDetectorProxy = new EncodingDetectorProxy("org.apache.tika.parser.html.HtmlEncodingDetector", getClass().getClassLoader());
-        this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
         this.dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
         try {
             this.msg = new MAPIMessage(root);
@@ -235,8 +233,15 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 } else if (htmlChunk instanceof StringChunk) {
                     data = ((StringChunk) htmlChunk).getRawValue();
                 }
+                Parser htmlParser =
+                        EmbeddedDocumentUtil.tryToFindExistingLeafParser(
+                                "org.apache.tika.parser.html.HtmlParser", parseContext);
+                if (htmlParser == null) {
+                    htmlParser = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
+                }
+
                 if (data != null) {
-                    htmlParserProxy.parse(
+                    htmlParser.parse(
                             new ByteArrayInputStream(data),
                             new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                             new Metadata(), parseContext
@@ -249,7 +254,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
                         MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
                 );
-                Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingParser(RTF, parseContext);
+                Parser rtfParser =
+                        EmbeddedDocumentUtil.tryToFindExistingLeafParser(
+                                RTFParser.class.getCanonicalName(), parseContext);
                 if (rtfParser == null) {
                     rtfParser = new RTFParser();
                 }

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

Reply via email to