This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bd1c7edc5f TIKA-4630-on-main (#2551)
bd1c7edc5f is described below
commit bd1c7edc5f7aa1a72e6898bf408ef6898b1e5da0
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 23 17:09:29 2026 -0500
TIKA-4630-on-main (#2551)
* TIKA-4630 -- improve tracking of internal paths (#2548)
---
docs/modules/ROOT/nav.adoc | 1 +
.../ROOT/pages/advanced/embedded-documents.adoc | 252 +++++++++++++++++++++
docs/modules/ROOT/pages/advanced/index.adoc | 1 +
docs/src/main/asciidoc/advanced/index.adoc | 7 +-
docs/src/main/asciidoc/using-tika/index.adoc | 6 +
.../main/asciidoc/using-tika/java-api/index.adoc | 7 +-
pom.xml | 3 +
.../java/org/apache/tika/io/FilenameUtils.java | 14 +-
.../main/java/org/apache/tika/metadata/PST.java | 1 -
.../apache/tika/metadata/TikaCoreProperties.java | 9 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 2 +
.../src/test/java/org/apache/tika/TikaTest.java | 4 +-
.../tika/parser/microsoft/libpst/EmailVisitor.java | 9 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 7 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 7 +-
.../parser/microsoft/libpst/TestLibPstParser.java | 11 +-
.../ooxml/OOXMLContainerExtractionTest.java | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 4 +
.../parser/microsoft/pst/OutlookPSTParserTest.java | 4 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 2 +
.../apache/tika/parser/pkg/CompressorParser.java | 46 +++-
.../org/apache/tika/parser/pkg/PackageParser.java | 3 +-
.../org/apache/tika/parser/wacz/WACZParser.java | 1 +
.../tika/parser/RecursiveParserWrapperTest.java | 47 +++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 20 +-
26 files changed, 409 insertions(+), 63 deletions(-)
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index b44e6e0031..1b5db90702 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -32,6 +32,7 @@
* xref:advanced/index.adoc[Advanced]
** xref:advanced/robustness.adoc[Robustness]
** xref:advanced/spooling.adoc[Spooling]
+** xref:advanced/embedded-documents.adoc[Embedded Document Metadata]
* xref:faq.adoc[FAQ]
* xref:security.adoc[Security]
* xref:roadmap.adoc[Roadmap]
diff --git a/docs/modules/ROOT/pages/advanced/embedded-documents.adoc
b/docs/modules/ROOT/pages/advanced/embedded-documents.adoc
new file mode 100644
index 0000000000..bd6afbc41d
--- /dev/null
+++ b/docs/modules/ROOT/pages/advanced/embedded-documents.adoc
@@ -0,0 +1,252 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Embedded Document Metadata
+
+When Tika parses container files (such as ZIP archives, emails, PDFs with
attachments, or
+Microsoft Office documents), it extracts embedded documents recursively. Tika
provides
+several metadata fields to help you understand and track the structure of
these embedded
+resources.
+
+== Overview
+
+Understanding embedded document metadata requires distinguishing between two
fundamentally
+different types of information:
+
+* *Containment Structure (Tika-Generated)* - Metadata that Tika generates to
track _how documents
+ are nested within each other_. This answers questions like: "Which file
contained this
+ attachment?" and "What is the nesting depth?"
+
+* *Container Metadata (From the File)* - Metadata that comes from _the
container file itself_,
+ describing what the container knows about its contents. This answers
questions like: "What
+ path was this file stored at inside the archive?" and "What was the original
filename?"
+
+The distinction matters because containers often store embedded files in
internal directory
+structures that are independent of how deeply nested the embedding is. A ZIP
file preserves
+its original folder hierarchy; an OOXML document stores media in `xl/media/`
or `ppt/media/`;
+a PST file organizes emails by folder. This internal organization is separate
from the
+question of containment.
+
+== Containment Structure (Tika-Generated)
+
+These fields are generated by Tika during parsing to track the _nesting
relationships_ between
+documents. They answer: "Which document contained this one?" All fields are
defined in
+`TikaCoreProperties`.
+
+=== Nesting Identifiers
+
+`TikaCoreProperties.EMBEDDED_ID` (`X-TIKA:embedded_id`)::
+A 1-indexed integer assigned by Tika to each embedded document during parsing.
IDs are
+assigned in the order documents are encountered by the
`RecursiveParserWrapper`. This ID
+uniquely identifies each embedded document within a single parse operation.
+
+`TikaCoreProperties.EMBEDDED_ID_PATH` (`X-TIKA:embedded_id_path`)::
+A path showing the containment hierarchy using `EMBEDDED_ID` values. For
example, `/1/3`
+indicates that the file with `EMBEDDED_ID=3` was contained within the file with
+`EMBEDDED_ID=1`. This is the most reliable field for tracking containment
relationships.
++
+NOTE: This is purely about _which document contains which_ - it tells you
nothing about
+folder structures or original paths within the containers themselves.
+
+=== Synthetic Paths
+
+`TikaCoreProperties.EMBEDDED_RESOURCE_PATH` (`X-TIKA:embedded_resource_path`)::
+A synthetic path built by concatenating file names (from `RESOURCE_NAME_KEY`)
at each
+nesting level. This provides a human-readable path through the containment
hierarchy.
++
+WARNING: Do not use this field for creating directory structures to write out
attachments.
+There may be path collisions, illegal characters, or zip slip vulnerabilities.
Use
+`EMBEDDED_ID_PATH` for reliable containment tracking.
+
+`TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH`
(`X-TIKA:final_embedded_resource_path`)::
+Similar to `EMBEDDED_RESOURCE_PATH`, but calculated at the end of the full
parse. For some
+parsers, an embedded file's name isn't known until after its child files have
been parsed.
+This field may have fewer "unknown" file names than `EMBEDDED_RESOURCE_PATH`.
+
+=== Resource Naming
+
+`TikaCoreProperties.RESOURCE_NAME_KEY` (`X-TIKA:resourceName`)::
+The file name (not path) of the resource. Tika makes a best effort to
determine a meaningful
+name from the container's metadata. When unavailable, Tika falls back to
synthetic names
+such as `embedded-1.jpeg`.
++
+NOTE: In Tika 4.x, this field contains only the file name. Use `INTERNAL_PATH`
for the
+full path as stored in the container.
+
+== Container Metadata (From the File)
+
+These fields contain metadata that is stored _within the container file
itself_. This is
+information the container preserves about its contents, independent of how
Tika traverses
+the nesting structure. All fields below are defined in `TikaCoreProperties`.
+
+=== Internal Paths
+
+`TikaCoreProperties.INTERNAL_PATH` (`X-TIKA:internalPath`)::
+The path (including file name) as literally stored within the container. This
is what the
+container knows about where the file lives in its internal structure:
++
+* In a ZIP: the entry path (e.g., `reports/Q1/sales.xlsx`)
+* In a PST: the folder path plus message name (e.g., `Inbox/Important/Meeting
notes.msg`)
+* In an OOXML document: the part name (e.g., `/xl/media/image1.png`)
++
+This differs fundamentally from `EMBEDDED_RESOURCE_PATH`:
++
+* `INTERNAL_PATH` is what the _container stores_ about the file's location
within itself
+* `EMBEDDED_RESOURCE_PATH` is what _Tika synthesizes_ from the nesting
structure
+
+`TikaCoreProperties.ORIGINAL_RESOURCE_NAME` (`X-TIKA:origResourceName`)::
+For some file formats, the file path where the document was last saved on the
creator's
+system. For example, an `.xlsx` file named `budget.xlsx` may include a
metadata property
+storing where it was last saved: `C:\Users\Alice\budget.xlsx`. This is not
specific to
+embedded files - it's a property that certain file formats preserve about
themselves.
+
+== Microsoft-Specific Metadata
+
+Microsoft Office formats use additional identifiers for embedded objects.
+
+`TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID`
(`X-TIKA:embeddedRelationshipId`)::
+A Microsoft-specific identifier used internally to reference embedded objects
within
+Office documents. This is the relationship ID from the Office Open XML or OLE
structure.
+
+`Office.EMBEDDED_STORAGE_CLASS_ID` (`msoffice:embeddedStorageClassId`)::
+A UUID that identifies the class of embedded object in Microsoft formats.
While not
+exactly a MIME type, it provides similar information about what type of object
is
+embedded. Defined in the `Office` metadata class.
+
+== Quick Reference
+
+[cols="2,2,1"]
+|===
+|Property |Metadata Key |Source
+
+|`EMBEDDED_ID`
+|`X-TIKA:embedded_id`
+|Containment
+
+|`EMBEDDED_ID_PATH`
+|`X-TIKA:embedded_id_path`
+|Containment
+
+|`EMBEDDED_RESOURCE_PATH`
+|`X-TIKA:embedded_resource_path`
+|Containment
+
+|`FINAL_EMBEDDED_RESOURCE_PATH`
+|`X-TIKA:final_embedded_resource_path`
+|Containment
+
+|`RESOURCE_NAME_KEY`
+|`X-TIKA:resourceName`
+|Containment
+
+|`INTERNAL_PATH`
+|`X-TIKA:internalPath`
+|Container
+
+|`ORIGINAL_RESOURCE_NAME`
+|`X-TIKA:origResourceName`
+|Container
+
+|`EMBEDDED_RELATIONSHIP_ID`
+|`X-TIKA:embeddedRelationshipId`
+|Container (MS)
+
+|`Office.EMBEDDED_STORAGE_CLASS_ID`
+|`msoffice:embeddedStorageClassId`
+|Container (MS)
+|===
+
+== Example: Understanding the Difference
+
+Consider a ZIP file `archive.zip` containing `reports/Q1/sales.xlsx`, where
the spreadsheet
+itself contains an embedded image:
+
+[cols="1,2,2"]
+|===
+|Document |Field |Value
+
+.5+|Container (`archive.zip`)
+|`EMBEDDED_ID`
+|_(not set - this is the root document)_
+
+|`EMBEDDED_ID_PATH`
+|_(not set)_
+
+|`INTERNAL_PATH`
+|_(not set)_
+
+|`RESOURCE_NAME_KEY`
+|`archive.zip`
+
+|`EMBEDDED_RESOURCE_PATH`
+|_(not set)_
+
+.5+|Spreadsheet (`sales.xlsx`)
+|`EMBEDDED_ID`
+|`1`
+
+|`EMBEDDED_ID_PATH`
+|`/1`
+
+|`INTERNAL_PATH`
+|`reports/Q1/sales.xlsx` (from ZIP entry)
+
+|`RESOURCE_NAME_KEY`
+|`sales.xlsx`
+
+|`EMBEDDED_RESOURCE_PATH`
+|`/sales.xlsx`
+
+.5+|Embedded image in spreadsheet
+|`EMBEDDED_ID`
+|`2`
+
+|`EMBEDDED_ID_PATH`
+|`/1/2` (embedded in file with ID=1)
+
+|`INTERNAL_PATH`
+|`/xl/media/image1.png` (from XLSX structure)
+
+|`RESOURCE_NAME_KEY`
+|`image1.png`
+
+|`EMBEDDED_RESOURCE_PATH`
+|`/sales.xlsx/image1.png`
+|===
+
+== Key Observations
+
+The table above illustrates the fundamental distinction between containment
tracking and
+container metadata:
+
+*Containment structure (Tika-generated):*
+
+* `EMBEDDED_ID_PATH` `/1/2` tells you that the image (ID=2) was found _inside_
the
+ spreadsheet (ID=1). It answers: "What contains what?"
+* `EMBEDDED_RESOURCE_PATH` `/sales.xlsx/image1.png` is synthesized from file
names at each
+ nesting level. It provides a human-readable path through the containment
hierarchy.
+
+*Container metadata (from the file):*
+
+* `INTERNAL_PATH` for the spreadsheet (`reports/Q1/sales.xlsx`) is what the
_ZIP file knows_
+ about where that entry was stored - its internal folder structure.
+* `INTERNAL_PATH` for the image (`/xl/media/image1.png`) is what the _XLSX file
knows_ about
+ where that media file lives - its internal OOXML part name.
+
+Notice that `INTERNAL_PATH` resets at each container boundary. The image's
internal path
+doesn't include `reports/Q1/` because that path information belongs to the ZIP
container,
+not the XLSX container. Each container only knows about its own internal
organization.
diff --git a/docs/modules/ROOT/pages/advanced/index.adoc
b/docs/modules/ROOT/pages/advanced/index.adoc
index 83cc5d15bb..c98f515020 100644
--- a/docs/modules/ROOT/pages/advanced/index.adoc
+++ b/docs/modules/ROOT/pages/advanced/index.adoc
@@ -23,6 +23,7 @@ This section covers advanced usage and internals of Apache
Tika.
* xref:advanced/robustness.adoc[Robustness] - Process isolation and fault
tolerance when parsing untrusted content
* xref:advanced/spooling.adoc[TikaInputStream and Spooling] - Understanding
how TikaInputStream handles buffering, caching, and spooling to disk
+* xref:advanced/embedded-documents.adoc[Embedded Document Metadata] -
Understanding how Tika tracks embedded documents and their paths
// Add links to specific topics as they are created
// * link:custom-parsers.html[Writing Custom Parsers]
diff --git a/docs/src/main/asciidoc/advanced/index.adoc
b/docs/src/main/asciidoc/advanced/index.adoc
index f8350c86b8..6fd0125c1a 100644
--- a/docs/src/main/asciidoc/advanced/index.adoc
+++ b/docs/src/main/asciidoc/advanced/index.adoc
@@ -23,9 +23,4 @@ This section covers advanced usage and internals of Apache
Tika.
* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance
when parsing untrusted content
* xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how
TikaInputStream handles buffering, caching, and spooling to disk
-
-// Add links to specific topics as they are created
-// * link:custom-parsers.html[Writing Custom Parsers]
-// * link:custom-detectors.html[Writing Custom Detectors]
-// * link:configuration.html[Advanced Configuration]
-// * link:performance.html[Performance Tuning]
+* xref:metadata/index.adoc[Metadata Reference] - Documentation for Tika's
metadata fields
diff --git a/docs/src/main/asciidoc/using-tika/index.adoc
b/docs/src/main/asciidoc/using-tika/index.adoc
index ada34abc4c..2f13102e82 100644
--- a/docs/src/main/asciidoc/using-tika/index.adoc
+++ b/docs/src/main/asciidoc/using-tika/index.adoc
@@ -63,3 +63,9 @@ Use Tika via gRPC protocol. Best for high-performance,
cross-language communicat
For processing large volumes of documents, see xref:../pipes/index.adoc[Tika
Pipes],
which provides fault-tolerant, scalable document processing and works with all
of the
above integration methods.
+
+== Understanding the Output
+
+xref:../advanced/metadata/embedded-documents.adoc[Embedded Document Metadata]::
+Learn how Tika tracks and reports metadata for embedded documents
(attachments, images,
+and other resources contained within files).
diff --git a/docs/src/main/asciidoc/using-tika/java-api/index.adoc
b/docs/src/main/asciidoc/using-tika/java-api/index.adoc
index 703a2cf2c2..8ab2b22291 100644
--- a/docs/src/main/asciidoc/using-tika/java-api/index.adoc
+++ b/docs/src/main/asciidoc/using-tika/java-api/index.adoc
@@ -31,8 +31,5 @@ xref:getting-started.adoc[Getting Started] for guidance on
choosing the right ap
== Topics
* xref:getting-started.adoc[Getting Started] - Recommendations and
PipesForkParser usage
-
-// Add links to specific topics as they are created
-// * link:parsing.html[Parsing Documents]
-// * link:detection.html[Content Detection]
-// * link:configuration.html[Configuration]
+* xref:../../advanced/metadata/embedded-documents.adoc[Embedded Document
Metadata] -
+ Understanding attachment and embedded resource tracking
diff --git a/pom.xml b/pom.xml
index 417aab5f35..34995ce498 100644
--- a/pom.xml
+++ b/pom.xml
@@ -199,6 +199,9 @@ least three +1 Tika PMC votes are cast.
<inputExclude>tika-bundle/src/main/resources/META-INF/MANIFEST.MF</inputExclude>
<inputExclude>.gitattributes</inputExclude>
+ <!-- Antora UI supplemental files -->
+ <inputExclude>docs/supplemental-ui/**</inputExclude>
+
<!-- subprojects already checked, added for RAT 0.17, see also
RAT-97 -->
<inputExclude>tika-*/**</inputExclude>
</inputExcludes>
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index b06c843c48..b2e2f5d878 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -292,12 +292,14 @@ public class FilenameUtils {
//may return null
private static String getEmbeddedPath(Metadata metadata) {
- //potentially look for other values in embedded path or original file
name, etc...
- //maybe different fallback order?
String path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
if (! StringUtils.isBlank(path)) {
return path;
}
+ path = metadata.get(TikaCoreProperties.INTERNAL_PATH);
+ if (! StringUtils.isBlank(path)) {
+ return path;
+ }
path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (! StringUtils.isBlank(path)) {
return path;
@@ -311,22 +313,22 @@ public class FilenameUtils {
//this tries for resource name first, and then backs off to path
private static String getEmbeddedName(Metadata metadata) {
- //potentially look for other values in embedded path or original file
name, etc...
- //maybe different fallback order?
String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (! StringUtils.isBlank(path)) {
return path;
}
+ path = metadata.get(TikaCoreProperties.INTERNAL_PATH);
+ if (! StringUtils.isBlank(path)) {
+ return path;
+ }
path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
if (! StringUtils.isBlank(path)) {
return path;
}
-
path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
if (! StringUtils.isBlank(path)) {
return path;
}
-
return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PST.java
b/tika-core/src/main/java/org/apache/tika/metadata/PST.java
index 7847e7f5c2..860e3fea77 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PST.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PST.java
@@ -19,7 +19,6 @@ package org.apache.tika.metadata;
public interface PST {
String PST_PREFIX = "pst:";
- Property PST_FOLDER_PATH = Property.internalText(PST_PREFIX +
"folderPath");
Property DESCRIPTOR_NODE_ID = Property.internalText(PST_PREFIX +
"discriptorNodeId");
Property IS_VALID = Property.internalBoolean(PST_PREFIX + "isValid");
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index b89323fc11..cc712543b5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -204,12 +204,19 @@ public interface TikaCoreProperties {
Property.internalTextBag(TIKA_META_PREFIX + "origResourceName");
/**
* This should be used to store the path (relative or full)
- * of the source file, including the file name,
+ * of the source/container file, including the file name,
* e.g. doc/path/to/my_pdf.pdf
* <p>
* This can also be used for a primary key within a database.
*/
Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX +
"sourcePath");
+
+ /**
+ * This records the path of an embedded file, including the file name, exactly as it
+ * is stored within the container file. For example, a zip file may include an msg with this path: /my-emails/important/this.msg
+ */
+ Property INTERNAL_PATH = Property.internalText(TIKA_META_PREFIX +
"internalPath");
+
/**
* This is currently used to identify Content-Type that may be
* included within a document, such as in html documents
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index f774e8efe6..4fd099454c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -185,6 +185,8 @@ public class RecursiveParserWrapper extends ParserDecorator
{
String objectName = "";
if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ } else if (metadata.get(TikaCoreProperties.INTERNAL_PATH) != null) {
+ objectName =
FilenameUtils.getName(metadata.get(TikaCoreProperties.INTERNAL_PATH));
} else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID)
!= null) {
objectName =
metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
} else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 592447e2e0..a8e37a85b2 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -496,8 +496,10 @@ public abstract class TikaTest {
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
FilenameUtils.getName(filePath));
try (TikaInputStream tis = getResourceAsStream("/test-documents/" +
filePath)) {
- wrapper.parse(tis, handler, new Metadata(), context);
+ wrapper.parse(tis, handler, metadata, context);
}
return handler.getMetadataList();
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
index 30070bc23e..6d5f8c06d2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
@@ -28,7 +28,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PST;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -73,10 +73,11 @@ public class EmailVisitor implements FileVisitor<Path> {
private void process(Path file) throws IOException {
Metadata emailMetadata = new Metadata();
- String pstPath = root
- .relativize(file.getParent())
+ String internalPath = root
+ .relativize(file)
.toString();
- emailMetadata.set(PST.PST_FOLDER_PATH, pstPath);
+ emailMetadata.set(TikaCoreProperties.INTERNAL_PATH, internalPath);
+ emailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
file.getFileName().toString());
try (TikaInputStream tis = TikaInputStream.get(file)) {
try {
embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
emailMetadata, new ParseContext(), true);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index e357388f17..e1e8a24503 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -55,6 +55,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -183,7 +184,9 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
try (InputStream tStream = tPart.getInputStream()) {
Metadata thumbnailMetadata = new Metadata();
String thumbName = tPart.getPartName().getName();
-
thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, thumbName);
+ thumbnailMetadata.set(TikaCoreProperties.INTERNAL_PATH,
thumbName);
+ thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ FilenameUtils.getName(thumbName));
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(XHTML, "class", "class", "CDATA",
"embedded");
@@ -348,6 +351,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
+ metadata.set(TikaCoreProperties.INTERNAL_PATH,
part.getPartName().getName());
DirectoryNode root = fs.getRoot();
POIFSDocumentType type = POIFSDocumentType.detectType(root);
@@ -454,6 +458,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
embeddedResourceType.name());
+ metadata.set(TikaCoreProperties.INTERNAL_PATH,
part.getPartName().getName());
// Get the name
updateResourceName(part, embeddedPartMetadata, metadata);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 44d470cd32..485c96eeef 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -116,8 +116,11 @@ public class OutlookPSTParser implements Parser {
while (pstMail != null) {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
- metadata.set(PST.PST_FOLDER_PATH, folderPath);
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getSubject() + ".msg");
+ String resourceName = pstMail.getSubject() + ".msg";
+ String internalPath = folderPath.endsWith("/") ?
+ folderPath + resourceName : folderPath + "/" +
resourceName;
+ metadata.set(TikaCoreProperties.INTERNAL_PATH, internalPath);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
resourceName);
long length = estimateSize(pstMail);
try (TikaInputStream tis =
TikaInputStream.getFromContainer(pstMail, length, metadata)) {
embeddedExtractor.parseEmbedded(tis, handler, metadata,
new ParseContext(), true);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
index cc75a774d4..8bb674786f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -29,7 +29,6 @@ import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
@@ -61,9 +60,8 @@ public class TestLibPstParser extends TikaTest {
for (int i = 1; i < metadataList.size(); i++) {
String path = metadataList
.get(i)
- .get(PST.PST_FOLDER_PATH);
- if (path != null) {
- assertEquals("hong-thai.nguyen", path);
+ .get(TikaCoreProperties.INTERNAL_PATH);
+ if (path != null && path.startsWith("hong-thai.nguyen/")) {
validPaths++;
}
}
@@ -100,9 +98,8 @@ public class TestLibPstParser extends TikaTest {
for (int i = 1; i < metadataList.size(); i++) {
String path = metadataList
.get(i)
- .get(PST.PST_FOLDER_PATH);
- if (path != null) {
- assertEquals("hong-thai.nguyen", path);
+ .get(TikaCoreProperties.INTERNAL_PATH);
+ if (path != null && path.startsWith("hong-thai.nguyen/")) {
validPaths++;
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index dfe86f2040..6dfda6e553 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -262,7 +262,7 @@ public class OOXMLContainerExtractionTest extends
AbstractPOIContainerExtraction
assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx",
handler.filenames.get(6));
assertEquals("Microsoft_Office_Word_Document2.docx",
handler.filenames.get(7));
assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc",
handler.filenames.get(8));
- assertEquals("/docProps/thumbnail.jpeg", handler.filenames.get(9));
+ assertEquals("thumbnail.jpeg", handler.filenames.get(9));
// But we do know their types
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of
embedded office doc
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 1b7b08db50..2538f3b7b2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -1704,6 +1705,9 @@ public class OOXMLParserTest extends
MultiThreadedTikaTest {
assertEquals("audio/mpeg",
metadataList.get(1).get(Metadata.CONTENT_TYPE));
assertEquals("image/png",
metadataList.get(2).get(Metadata.CONTENT_TYPE));
assertEquals("image/jpeg",
metadataList.get(3).get(Metadata.CONTENT_TYPE));
+ // Verify INTERNAL_PATH is set for embedded media
+
assertNotNull(metadataList.get(1).get(TikaCoreProperties.INTERNAL_PATH));
+
assertTrue(metadataList.get(1).get(TikaCoreProperties.INTERNAL_PATH).contains("/ppt/media/"));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index e73c6c9fef..3adb47ac82 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -29,7 +29,6 @@ import org.apache.tika.TikaTest;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;
@@ -77,7 +76,8 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals("Jörn Kottmann", m1.get(MAPI.FROM_REPRESENTING_NAME));
assertEquals("[email protected]",
m1.get(MAPI.FROM_REPRESENTING_EMAIL));
assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS));
- assertEquals("/Début du fichier de données Outlook",
m1.get(PST.PST_FOLDER_PATH));
+ assertEquals("/Début du fichier de données Outlook/Re: Feature
Generators.msg",
+ m1.get(TikaCoreProperties.INTERNAL_PATH));
//test that subject is making it into the xhtml
assertContains("<meta name=\"dc:subject\" content=\"Re: Feature
Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index e6f192ee0f..ed1ca792c8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -266,6 +266,7 @@ public class OpenDocumentParser implements Parser {
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH,
embeddedName);
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
entry.getName());
if (embeddedName.startsWith("Thumbnails/")) {
@@ -310,6 +311,7 @@ public class OpenDocumentParser implements Parser {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, embeddedName);
handler = new OpenDocumentMacroHandler(handler, context);
try {
tisZip.setCloseShield();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 03be853bb5..e36be1c397 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -47,6 +47,7 @@ import
org.apache.commons.compress.compressors.CompressorStreamFactory;
import
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import
org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipParameters;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
import
org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
@@ -71,6 +72,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Parser for various compression formats.
@@ -236,21 +238,12 @@ public class CompressorParser implements Parser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
try {
Metadata entrydata = new Metadata();
- String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (name != null) {
- if (name.endsWith(".tbz") || name.endsWith(".tbz2")) {
- name = name.substring(0, name.lastIndexOf(".")) + ".tar";
- } else if (name.endsWith(".bz") || name.endsWith(".bz2") ||
name.endsWith(".xz") ||
- name.endsWith(".zlib") || name.endsWith(".pack") ||
name.endsWith(".br")) {
- name = name.substring(0, name.lastIndexOf("."));
- } else if (name.length() > 0) {
- name = GzipUtils.getUncompressedFileName(name);
- }
- entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ if (cis instanceof GzipCompressorInputStream) {
+ extractGzipMetadata((GzipCompressorInputStream) cis,
entrydata);
}
+ setName(metadata, entrydata);
// Use the delegate parser to parse the compressed document
EmbeddedDocumentExtractor extractor =
@@ -268,6 +261,35 @@ public class CompressorParser implements Parser {
xhtml.endDocument();
}
+    /**
+     * Records the file name stored in the gzip header, when one is present,
+     * as the {@link TikaCoreProperties#INTERNAL_PATH} of the embedded entry.
+     *
+     * @param gzcis    gzip stream whose header parameters are consulted
+     * @param metadata metadata of the embedded (decompressed) entry to update
+     */
+    private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
+        GzipParameters header = gzcis.getMetaData();
+        String headerFileName = (header == null) ? null : header.getFileName();
+        if (StringUtils.isBlank(headerFileName)) {
+            //no header, or no usable name recorded in it
+            return;
+        }
+        metadata.set(TikaCoreProperties.INTERNAL_PATH, headerFileName);
+        //TODO: modification, OS, comment
+    }
+
+    /**
+     * Derives the resource name of the decompressed entry from the parent's
+     * {@link TikaCoreProperties#RESOURCE_NAME_KEY} and stores it on the entry.
+     * <p>
+     * Tarball shorthands ({@code .tgz}, {@code .tbz}, {@code .tbz2}) become
+     * {@code .tar}; plain compression suffixes are removed; any other name is
+     * handed to {@link GzipUtils#getUncompressedFileName(String)}.
+     *
+     * @param parentMetadata metadata of the compressed container; read only
+     * @param metadata       metadata of the embedded entry; receives the name
+     */
+    private void setName(Metadata parentMetadata, Metadata metadata) {
+        String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        //if parent's name is blank stop now
+        if (StringUtils.isBlank(name)) {
+            return;
+        }
+        if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
+            name = name.substring(0, name.lastIndexOf('.')) + ".tar";
+        } else if (name.endsWith(".gz") || name.endsWith(".bz") || name.endsWith(".bz2") ||
+                name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
+                name.endsWith(".br")) {
+            //every suffix tested above starts with '.', so lastIndexOf('.') is never -1 here;
+            //matching a bare "gz" would throw on dot-less names such as "archive-gz" and
+            //would strip ".svgz"/".cpgz" outright rather than letting GzipUtils map them
+            name = name.substring(0, name.lastIndexOf('.'));
+        } else {
+            //name is known to be non-blank here; remaining gzip-style suffixes are
+            //left to commons-compress
+            name = GzipUtils.getUncompressedFileName(name);
+        }
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+    }
+
/**
* @param metadata
* @return CompressorStream name based on the content-type value
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 5b8aecbc0f..26970002be 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -217,13 +217,12 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
if (name != null && name.length() > 0) {
name = name.replace("\\", "/");
entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ entrydata.set(TikaCoreProperties.INTERNAL_PATH, name);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", name);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
-
- entrydata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, name);
}
return entrydata;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
index 838ce3f5bf..57306fa5a6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
@@ -113,6 +113,7 @@ public class WACZParser implements Parser {
String name, XHTMLContentHandler xhtml, Metadata
parentMetadata,
EmbeddedDocumentExtractor ex) throws IOException,
SAXException {
Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.INTERNAL_PATH, zae.getName());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(zae.getSize()));
try (TikaInputStream tis =
TikaInputStream.get(getMaybeGzipInputStream(TikaInputStream.get(zais)))) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 5fada5528d..9b054d4ad4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -23,9 +23,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.ClosedInputStream;
@@ -127,11 +129,54 @@ public class RecursiveParserWrapperTest extends TikaTest {
List<Metadata> list = handler.getMetadataList();
assertEquals(12, list.size());
}
+
+    /**
+     * Checks the INTERNAL_PATH and EMBEDDED_RESOURCE_PATH values reported for
+     * every metadata object produced by recursively parsing a gzipped tarball.
+     */
+    @Test
+    public void testTarball() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("test-documents.tgz");
+
+        //the first metadata object is expected to carry no internal path;
+        //the decompressed tar itself is expected last
+        List<String> expectedInternalPaths = Arrays.asList(null,
+                "test-documents/testEXCEL.xls",
+                "test-documents/testHTML.html",
+                "Thumbnails/thumbnail.png",
+                "Thumbnails/thumbnail.pdf",
+                "test-documents/testOpenOffice2.odt",
+                "test-documents/testPDF.pdf",
+                "test-documents/testPPT.ppt",
+                "test-documents/testRTF.rtf",
+                "test-documents/testTXT.txt",
+                "test-documents/testWORD.doc",
+                "test-documents/testXML.xml",
+                "test-documents.tar");
+        List<String> actualInternalPaths = metadataList.stream()
+                .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH))
+                .collect(Collectors.toList());
+        assertEquals(expectedInternalPaths, actualInternalPaths);
+
+        //same ordering as above, now expressed as paths through the embedded-resource tree
+        List<String> expectedEmbeddedPaths = Arrays.asList(null,
+                "/test-documents.tar/testEXCEL.xls",
+                "/test-documents.tar/testHTML.html",
+                "/test-documents.tar/testOpenOffice2.odt/thumbnail.png",
+                "/test-documents.tar/testOpenOffice2.odt/thumbnail.pdf",
+                "/test-documents.tar/testOpenOffice2.odt",
+                "/test-documents.tar/testPDF.pdf",
+                "/test-documents.tar/testPPT.ppt",
+                "/test-documents.tar/testRTF.rtf",
+                "/test-documents.tar/testTXT.txt",
+                "/test-documents.tar/testWORD.doc",
+                "/test-documents.tar/testXML.xml",
+                "/test-documents.tar");
+        List<String> actualEmbeddedPaths = metadataList.stream()
+                .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
+                .collect(Collectors.toList());
+        assertEquals(expectedEmbeddedPaths, actualEmbeddedPaths);
+    }
+
@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
- int writeLimit = 500;
+ int writeLimit = 510;
RecursiveParserWrapper wrapper = new
RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index cda95cc56b..11797a499e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -368,7 +368,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("91",
metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "height"));
assertEquals("352",
metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "width"));
- assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("testPDF_JBIG2.pdf",
metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("image0.jb2",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(MediaType.image("x-jbig2").toString(),
metadatas.get(1).get(Metadata.CONTENT_TYPE));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 04a00c5028..8a22855bd8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -77,17 +77,17 @@ public class ZipParserTest extends AbstractPkgTest {
assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
- // Also make sure EMBEDDED_RELATIONSHIP_ID was
+ // Also make sure INTERNAL_PATH was
// passed when parsing the embedded docs:
ParseContext context = new ParseContext();
- GatherRelIDsDocumentExtractor relIDs = new
GatherRelIDsDocumentExtractor();
- context.set(EmbeddedDocumentExtractor.class, relIDs);
+ GatherInternalPathsDocumentExtractor extractor = new
GatherInternalPathsDocumentExtractor();
+ context.set(EmbeddedDocumentExtractor.class, extractor);
try (TikaInputStream tis =
getResourceAsStream("/test-documents/testEmbedded.zip")) {
AUTO_DETECT_PARSER.parse(tis, new BodyContentHandler(), new
Metadata(), context);
}
- assertTrue(relIDs.allRelIDs.contains("test1.txt"));
- assertTrue(relIDs.allRelIDs.contains("test2.txt"));
+ assertTrue(extractor.allInternalPaths.contains("test1.txt"));
+ assertTrue(extractor.allInternalPaths.contains("test2.txt"));
}
@Test
@@ -123,13 +123,13 @@ public class ZipParserTest extends AbstractPkgTest {
results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
}
- private static class GatherRelIDsDocumentExtractor implements
EmbeddedDocumentExtractor {
- public Set<String> allRelIDs = new HashSet<>();
+ private static class GatherInternalPathsDocumentExtractor implements
EmbeddedDocumentExtractor {
+ public Set<String> allInternalPaths = new HashSet<>();
public boolean shouldParseEmbedded(Metadata metadata) {
- String relID =
metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
- if (relID != null) {
- allRelIDs.add(relID);
+ String internalPath =
metadata.get(TikaCoreProperties.INTERNAL_PATH);
+ if (internalPath != null) {
+ allInternalPaths.add(internalPath);
}
return false;
}