This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 88bff551f TIKA-3741 -- fix regression in handling embedded exceptions in ppt
88bff551f is described below
commit 88bff551fd05a3d7193291dcd3a98af56f38471a
Author: tallison <[email protected]>
AuthorDate: Wed Apr 27 09:01:18 2022 -0400
TIKA-3741 -- fix regression in handling embedded exceptions in ppt
---
.../tika/parser/microsoft/HSLFExtractor.java | 67 ++++++++++++----------
.../parser/microsoft/PowerPointParserTest.java | 8 +--
2 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 356c47e6a..50a19938d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -503,44 +503,49 @@ public class HSLFExtractor extends AbstractPOIFSExtractor
{
InputStream dataStream = null;
try {
dataStream = data.getInputStream();
+ } catch (SecurityException e) {
+ throw e;
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
continue;
}
- try (TikaInputStream stream =
TikaInputStream.get(dataStream)) {
- String mediaType = null;
- if ("Excel.Chart.8".equals(oleShape.getProgId())) {
- mediaType = "application/vnd.ms-excel";
- } else {
- MediaType mt =
getTikaConfig().getDetector().detect(stream, new Metadata());
- mediaType = mt.toString();
- }
- if
(mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
- ||
mediaType.equals("application/x-tika-msoffice")) {
- POIFSFileSystem poifs = null;
-
- try {
- poifs = new POIFSFileSystem(new
CloseShieldInputStream(stream));
- } catch (RuntimeException e) {
- throw new IOExceptionWithCause(e);
- }
- try {
- handleEmbeddedOfficeDoc(poifs.getRoot(),
objID, xhtml);
- } finally {
- if (poifs != null) {
- poifs.close();
- }
- }
- } else {
- handleEmbeddedResource(
- stream, objID, objID,
- mediaType, xhtml, false);
- }
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
+ handleData(objID, oleShape.getProgId(), dataStream, xhtml);
+ }
+ }
+ }
+ }
+
+ private void handleData(String objID, String progId, InputStream dataStream,
+ XHTMLContentHandler xhtml) {
+
+ try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
+ String mediaType = null;
+ if ("Excel.Chart.8".equals(progId)) {
+ mediaType = "application/vnd.ms-excel";
+ } else {
+ MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+ mediaType = mt.toString();
+ }
+ if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
+ || mediaType.equals("application/x-tika-msoffice")) {
+ POIFSFileSystem poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
+
+ try {
+ handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
+ } finally {
+ if (poifs != null) {
+ poifs.close();
}
}
+ } else {
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
}
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordException(e, parentMetadata);
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 658c0d572..ce60154bc 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -327,15 +327,15 @@ public class PowerPointParserTest extends TikaTest {
XMLResult r = getXML("testPPT_skipBadCompressedObject.ppt");
assertContains("NASA Human", r.xml);
assertEquals(2,
- r.metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM).length);
+ r.metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
assertContains("incorrect data check",
- r.metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+ r.metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
List<Metadata> metadataList =
getRecursiveMetadata("testPPT_skipBadCompressedObject.ppt");
assertEquals(2,
- metadataList.get(0).getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM).length);
+ metadataList.get(0).getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
assertContains("incorrect data check",
- metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+ metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
}
@Test(expected = EncryptedDocumentException.class)