This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 29be3239e TIKA-4374 -- embedded file path names (#2197)
29be3239e is described below
commit 29be3239e92caf667b3eb8970009734f53513a08
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 8 10:05:27 2025 -0400
TIKA-4374 -- embedded file path names (#2197)
---
.../org/apache/tika/eval/app/AbstractProfiler.java | 4 +-
.../src/main/resources/comparison-reports.xml | 155 +++++++++++++++++----
.../src/main/resources/profile-reports.xml | 15 +-
3 files changed, 143 insertions(+), 31 deletions(-)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
index 7e0a6a3aa..53e99aba6 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -401,9 +401,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.EMBEDDED_DEPTH, "0");
} else {
data.put(Cols.IS_EMBEDDED, TRUE);
- String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ String embeddedFilePath = m.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
if (!StringUtils.isBlank(embeddedFilePath)) {
- data.put(Cols.FILE_NAME, getFileName(m.get(embeddedFilePath)));
+ data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
}
if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index 60f694396..667578daa 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -454,9 +454,18 @@
<sql>
select concat(ma.mime_string, ' -> ', mb.mime_string) as
MIME_A_TO_MIME_B,
- file_path, a.file_name,
- c.length as CONTAINER_LENGTH,
- a.file_name
+ file_path,
+ case
+ when a.embedded_depth > 0
+ then a.embedded_file_path
+ else a.file_name
+ end as FILE_NAME_A,
+ case
+ when b.embedded_depth > 0
+ then b.embedded_file_path
+ else b.file_name
+ end as FILE_NAME_B,
+ c.length as CONTAINER_LENGTH
from profiles_a a
join profiles_b b on a.id=b.id
join mimes ma on ma.mime_id=a.mime_id
@@ -576,7 +585,11 @@
<sql>
select file_path as FILE_PATH,
- pa.file_name as FILE_NAME_A,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
c.length as CONTAINER_LENGTH,
ca.NUM_TOKENS as NUM_TOKENS_A,
cb.NUM_TOKENS as NUM_TOKENS_B,
@@ -630,12 +643,20 @@
<sql>
select
file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_TYPE_A,
mb.mime_string as MIME_TYPE_B,
- pa.file_name, pa.is_embedded
+ pa.is_embedded
from exceptions_a ea
left join exceptions_b eb on ea.id = eb.id
join profiles_a pa on pa.id=ea.id
@@ -655,8 +676,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_TYPE_A,
mb.mime_string as MIME_TYPE_B,
@@ -728,8 +757,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_TYPE_A,
mb.mime_string as MIME_TYPE_B,
@@ -771,7 +808,11 @@
<sql>
select file_path,
- p.file_name as FILE_NAME_A,
+ case
+ when p.embedded_depth > 0
+ then p.embedded_file_path
+ else p.file_name
+ end as FILE_NAME_A,
c.length as CONTAINER_LENGTH,
mime_string as MIME_TYPE,
orig_stack_trace, sort_stack_trace
@@ -791,7 +832,11 @@
<sql>
select file_path,
- p.file_name as FILE_NAME_B,
+ case
+ when p.embedded_depth > 0
+ then p.embedded_file_path
+ else p.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
mime_string as MIME_TYPE,
orig_stack_trace, sort_stack_trace
@@ -949,8 +994,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
@@ -1001,8 +1054,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
@@ -1065,8 +1126,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
@@ -1170,8 +1239,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
@@ -1201,8 +1278,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
@@ -1325,8 +1410,16 @@
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
ma.mime_string as mime_string_a,
mb.mime_string as mime_string_b,
pa.num_metadata_values as num_metadata_values_a,
@@ -1371,8 +1464,16 @@
includeSql="true">
<sql>
select file_path,
- pa.file_name as FILE_NAME_A,
- pb.file_name as FILE_NAME_B,
+ case
+ when pa.embedded_depth > 0
+ then pa.embedded_file_path
+ else pa.file_name
+ end as FILE_NAME_A,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
c.length as CONTAINTER_LENGTH,
ma.mime_string as MIME_STRING_A,
mb.mime_string as MIME_STRING_B,
diff --git a/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml b/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml
index d4636f466..f239e92ba 100644
--- a/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml
@@ -117,7 +117,13 @@
includeSql="true">
<!-- 0.50 is a complete heuristic -->
<sql>
- select file_path, file_name, is_embedded,
+ select file_path,
+ case
+ when p.embedded_depth > 0
+ then p.embedded_file_path
+ else p.file_name
+ end as FILE_NAME,
+ is_embedded,
mime_string, lang_id_1, common_tokens_lang,
num_tokens, num_alphabetic_tokens, num_common_tokens,
case
@@ -247,7 +253,12 @@
<sql>
select file_path,
- file_name, is_embedded,
+ case
+ when p.embedded_depth > 0
+ then p.embedded_file_path
+ else p.file_name
+ end as FILE_NAME,
+ is_embedded,
c.length as CONTAINER_LENGTH,
mime_string as MIME_TYPE,
orig_stack_trace, sort_stack_trace