This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4374 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3f3d69cc32e6066a06dbdd8f8a89a105f4623b32 Author: tallison <[email protected]> AuthorDate: Thu May 8 09:19:23 2025 -0400 TIKA-4374 -- embedded file path names --- .../org/apache/tika/eval/app/AbstractProfiler.java | 4 +- .../src/main/resources/comparison-reports.xml | 155 +++++++++++++++++---- .../src/main/resources/profile-reports.xml | 15 +- 3 files changed, 143 insertions(+), 31 deletions(-) diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java index 7e0a6a3aa..53e99aba6 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java @@ -401,9 +401,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer { data.put(Cols.EMBEDDED_DEPTH, "0"); } else { data.put(Cols.IS_EMBEDDED, TRUE); - String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); + String embeddedFilePath = m.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH); if (!StringUtils.isBlank(embeddedFilePath)) { - data.put(Cols.FILE_NAME, getFileName(m.get(embeddedFilePath))); + data.put(Cols.FILE_NAME, getFileName(embeddedFilePath)); data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath); } if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) { diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml index 60f694396..667578daa 100644 --- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml +++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml @@ -454,9 +454,18 @@ <sql> select concat(ma.mime_string, ' -> ', mb.mime_string) as MIME_A_TO_MIME_B, - file_path, a.file_name, - c.length as CONTAINER_LENGTH, - a.file_name + file_path, + case + when a.embedded_depth > 0 + then a.embedded_file_path + else a.file_name + end as FILE_NAME_A, + case + when b.embedded_depth > 0 + then b.embedded_file_path + else b.file_name + end as FILE_NAME_B, + c.length as CONTAINER_LENGTH from profiles_a a join profiles_b b on a.id=b.id join mimes ma on ma.mime_id=a.mime_id @@ -576,7 +585,11 @@ <sql> select file_path as FILE_PATH, - pa.file_name as FILE_NAME_A, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, c.length as CONTAINER_LENGTH, ca.NUM_TOKENS as NUM_TOKENS_A, cb.NUM_TOKENS as NUM_TOKENS_B, @@ -630,12 +643,20 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, - pa.file_name, pa.is_embedded + pa.is_embedded from exceptions_a ea left join exceptions_b eb on ea.id = eb.id join profiles_a pa on pa.id=ea.id @@ -655,8 +676,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, @@ -728,8 +757,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, @@ -771,7 +808,11 @@ <sql> select file_path, - p.file_name as FILE_NAME_A, + case + when p.embedded_depth > 0 + then p.embedded_file_path + else p.file_name + end as FILE_NAME_A, c.length as CONTAINER_LENGTH, mime_string as MIME_TYPE, orig_stack_trace, sort_stack_trace @@ -791,7 +832,11 @@ <sql> select file_path, - p.file_name as FILE_NAME_B, + case + when p.embedded_depth > 0 + then p.embedded_file_path + else p.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, mime_string as MIME_TYPE, orig_stack_trace, sort_stack_trace @@ -949,8 +994,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, @@ -1001,8 +1054,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, @@ -1065,8 +1126,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, @@ -1170,8 +1239,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, @@ -1201,8 +1278,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, @@ -1325,8 +1410,16 @@ <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, ma.mime_string as mime_string_a, mb.mime_string as mime_string_b, pa.num_metadata_values as num_metadata_values_a, @@ -1371,8 +1464,16 @@ includeSql="true"> <sql> select file_path, - pa.file_name as FILE_NAME_A, - pb.file_name as FILE_NAME_B, + case + when pa.embedded_depth > 0 + then pa.embedded_file_path + else pa.file_name + end as FILE_NAME_A, + case + when pb.embedded_depth > 0 + then pb.embedded_file_path + else pb.file_name + end as FILE_NAME_B, c.length as CONTAINTER_LENGTH, ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B, diff --git a/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml b/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml index d4636f466..f239e92ba 100644 --- a/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml +++ b/tika-eval/tika-eval-app/src/main/resources/profile-reports.xml @@ -117,7 +117,13 @@ includeSql="true"> <!-- 0.50 is a complete heuristic --> <sql> - select file_path, file_name, is_embedded, + select file_path, + case + when p.embedded_depth > 0 + then p.embedded_file_path + else p.file_name + end as FILE_NAME, + is_embedded, mime_string, lang_id_1, common_tokens_lang, num_tokens, num_alphabetic_tokens, num_common_tokens, case @@ -247,7 +253,12 @@ <sql> select file_path, - file_name, is_embedded, + case + when p.embedded_depth > 0 + then p.embedded_file_path + else p.file_name + end as FILE_NAME, + is_embedded, c.length as CONTAINER_LENGTH, mime_string as MIME_TYPE, orig_stack_trace, sort_stack_trace
