This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch 2.x in repository https://gitbox.apache.org/repos/asf/tika.git
commit b2a462c6d2e48d525543c1132671100fc7ff0e6f Author: tballison <[email protected]> AuthorDate: Mon Feb 27 13:34:43 2017 -0500 TIKA 2276 -- cleanup --- .../tika/extractor/EmbeddedDocumentUtil.java | 67 +++++++++++++++++----- .../java/org/apache/tika/parser/chm/ChmParser.java | 2 +- .../tika/parser/microsoft/JackcessExtractor.java | 9 +-- .../tika/parser/microsoft/OutlookExtractor.java | 15 +++-- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index a23c6da..3c3531e 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -35,6 +35,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.utils.ExceptionUtils; import org.xml.sax.ContentHandler; @@ -92,33 +93,73 @@ public class EmbeddedDocumentUtil implements Serializable { /** * Tries to find an existing parser within the ParseContext. - * Initially tries to find first child parser specific to that mediaType. - * Then backs off to the overall parser. + * It looks inside of CompositeParsers and ParserDecorators. + * The use case is when a parser needs to parse an internal stream + * that is _part_ of the document, e.g. rtf body inside an msg. + * <p/> * Can return <code>null</code> if the context contains no parser or - * if an appropriate parser can't be found. + * the correct parser can't be found. * - * @param mediaType + * @param clazz parser class to search for * @param context * @return */ - public static Parser tryToFindExistingParser(MediaType mediaType, ParseContext context) { + public static Parser tryToFindExistingLeafParser(String clazz, ParseContext context) { Parser p = context.get(Parser.class); + if (equals(p, clazz)) { + return p; + } + Parser returnParser = null; if (p != null) { - //try to find the sub parser + if (p instanceof ParserDecorator) { + p = ((ParserDecorator)p).getWrappedParser(); + } + if (equals(p, clazz)) { + return p; + } if (p instanceof CompositeParser) { - Map<MediaType, Parser> map = ((CompositeParser) p).getParsers(context); - Parser retParser = map.get(mediaType); - if (retParser != null) { - return retParser; - } + returnParser = findInComposite((CompositeParser) p, clazz, context); } } - if (p != null && p.getSupportedTypes(context).contains(mediaType)) { - return p; + if (returnParser != null && equals(returnParser, clazz)) { + return returnParser; + } + + return null; + } + + private static Parser findInComposite(CompositeParser p, String clazz, ParseContext context) { + Map<MediaType, Parser> map = p.getParsers(context); + for (Map.Entry<MediaType, Parser> e : map.entrySet()) { + Parser candidate = e.getValue(); + if (equals(candidate, clazz)) { + return candidate; + } + if (candidate instanceof ParserDecorator) { + candidate = ((ParserDecorator)candidate).getWrappedParser(); + } + if (equals(candidate, clazz)) { + return candidate; + } + if (candidate instanceof CompositeParser) { + candidate = findInComposite((CompositeParser) candidate, clazz, context); + } + if (equals(candidate, clazz)) { + return candidate; + } } return null; } + private static boolean equals(Parser parser, String clazz) { + if (parser == null) { + return false; + } + return parser.getClass().getCanonicalName().equals(clazz); + } + + + public PasswordProvider getPasswordProvider() { return context.get(PasswordProvider.class); } diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java index e9b0fbf..c9465ff 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java @@ -60,7 +60,7 @@ public class ChmParser extends AbstractParser { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { ChmExtractor chmExtractor = new ChmExtractor(stream); - Parser htmlProxy = EmbeddedDocumentUtil.tryToFindExistingParser(MediaType.TEXT_HTML, context); + Parser htmlProxy = EmbeddedDocumentUtil.tryToFindExistingLeafParser("org.apache.tika.parser.html.HtmlParser", context); if (htmlProxy == null) { htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser"); } diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index 64d8fdd..19bee8b 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -83,12 +83,13 @@ class JackcessExtractor extends AbstractPOIFSExtractor { currencyFormatter = NumberFormat.getCurrencyInstance(locale); shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale); this.parseContext = context; - Parser tmpHtml = EmbeddedDocumentUtil.tryToFindExistingParser(MediaType.TEXT_HTML, context); - if (tmpHtml == null) { - tmpHtml = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); + Parser tmpHtmlParser = + EmbeddedDocumentUtil.tryToFindExistingLeafParser("org.apache.tika.parser.html.HtmlParser", context); + if (tmpHtmlParser == null) { + tmpHtmlParser = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); } - this.htmlParserProxy = tmpHtml; + this.htmlParserProxy = tmpHtmlParser; } public void parse(Database db, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 8843d69..614bb5b 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -91,7 +91,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private static final Metadata EMPTY_METADATA = new Metadata(); private final SimpleDateFormat dateFormat; private final EncodingDetector htmlEncodingDetectorProxy; - private final Parser htmlParserProxy; private final MAPIMessage msg; private final ParseContext parseContext; @@ -104,7 +103,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { super(context); this.parseContext = context; this.htmlEncodingDetectorProxy = new EncodingDetectorProxy("org.apache.tika.parser.html.HtmlEncodingDetector", getClass().getClassLoader()); - this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); this.dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); try { this.msg = new MAPIMessage(root); @@ -235,8 +233,15 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } + Parser htmlParser = + EmbeddedDocumentUtil.tryToFindExistingLeafParser( + "org.apache.tika.parser.html.HtmlParser", parseContext); + if (htmlParser == null) { + htmlParser = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); + } + if (data != null) { - htmlParserProxy.parse( + htmlParser.parse( new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext @@ -249,7 +254,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { MAPIRtfAttribute rtf = new MAPIRtfAttribute( MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() ); - Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingParser(RTF, parseContext); + Parser rtfParser = + EmbeddedDocumentUtil.tryToFindExistingLeafParser( + RTFParser.class.getCanonicalName(), parseContext); if (rtfParser == null) { rtfParser = new RTFParser(); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
