This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 5a3a7d2bb TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046)
5a3a7d2bb is described below

commit 5a3a7d2bb434de6ef650c950e2d90d005f388f75
Author: Tim Allison <[email protected]>
AuthorDate: Thu Nov 14 15:24:45 2024 -0500

    TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046)
---
 .../metadata/writefilter/StandardWriteFilter.java  | 12 ++++---
 .../writefilter/StandardWriteFilterFactory.java    | 12 +++++--
 .../writefilter/StandardWriteFilterTest.java       | 42 ++++++++++++++++++----
 .../org/apache/tika/config/TIKA-3695-exclude.xml   | 35 ++++++++++++++++++
 4 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index f0e9f1fe6..a245e8d2c 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -113,6 +113,7 @@ public class StandardWriteFilter implements 
MetadataWriteFilter, Serializable {
 
 
     private final Set<String> includeFields;
+    private final Set<String> excludeFields;
 
     private Map<String, Integer> fieldSizes = new HashMap<>();
 
@@ -125,12 +126,14 @@ public class StandardWriteFilter implements 
MetadataWriteFilter, Serializable {
      * @param maxEstimatedSize
      * @param includeFields if null or empty, all fields are included; 
otherwise, which fields
      *                      to add to the metadata object.
+     * @param excludeFields these fields will not be included (unless they're 
in {@link StandardWriteFilter#ALWAYS_SET_FIELDS})
      * @param includeEmpty if <code>true</code>, this will set or add an empty 
value to the
      *                     metadata object.
      */
     protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int 
maxEstimatedSize,
                                int maxValuesPerField,
                                Set<String> includeFields,
+                               Set<String> excludeFields,
                                boolean includeEmpty) {
 
         this.maxKeySize = maxKeySize;
@@ -138,6 +141,7 @@ public class StandardWriteFilter implements 
MetadataWriteFilter, Serializable {
         this.maxTotalEstimatedSize = maxEstimatedSize;
         this.maxValuesPerField = maxValuesPerField;
         this.includeFields = includeFields;
+        this.excludeFields = excludeFields;
         this.includeEmpty = includeEmpty;
     }
 
@@ -176,6 +180,7 @@ public class StandardWriteFilter implements 
MetadataWriteFilter, Serializable {
             setAlwaysInclude(field, value, data);
             return;
         }
+
         StringSizePair filterKey = filterKey(field, value, data);
         setFilterKey(filterKey, value, data);
     }
@@ -433,11 +438,10 @@ public class StandardWriteFilter implements 
MetadataWriteFilter, Serializable {
         if (ALWAYS_SET_FIELDS.contains(name)) {
             return true;
         }
-        if (includeFields == null ||
-                includeFields.contains(name)) {
-            return true;
+        if (excludeFields.contains(name)) {
+            return false;
         }
-        return false;
+        return includeFields.isEmpty() || includeFields.contains(name);
     }
 
     private static int estimateSize(String s) {
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b7d60b540..df6d8b42d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -33,7 +33,8 @@ public class StandardWriteFilterFactory implements 
MetadataWriteFilterFactory {
     public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
     public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
 
-    private Set<String> includeFields = null;
+    private Set<String> includeFields = Collections.EMPTY_SET;
+    private Set<String> excludeFields = Collections.EMPTY_SET;
     private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
     private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
     private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
@@ -55,7 +56,8 @@ public class StandardWriteFilterFactory implements 
MetadataWriteFilterFactory {
         }
 
         return new StandardWriteFilter(maxKeySize, maxFieldSize,
-                maxTotalEstimatedBytes, maxValuesPerField, includeFields, 
includeEmpty);
+                maxTotalEstimatedBytes, maxValuesPerField, includeFields,
+                excludeFields, includeEmpty);
     }
 
     public void setIncludeFields(List<String> includeFields) {
@@ -64,6 +66,12 @@ public class StandardWriteFilterFactory implements 
MetadataWriteFilterFactory {
         this.includeFields = Collections.unmodifiableSet(keys);
     }
 
+    public void setExcludeFields(List<String> excludeFields) {
+        Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size());
+        keys.addAll(excludeFields);
+        this.excludeFields = Collections.unmodifiableSet(keys);
+    }
+
     public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) {
         this.maxTotalEstimatedBytes = maxTotalEstimatedBytes;
     }
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
index 7b7e8710d..7c3369bfd 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertNull;
 
 import java.io.ByteArrayInputStream;
 import java.nio.charset.StandardCharsets;
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 
@@ -116,7 +117,7 @@ public class StandardWriteFilterTest extends TikaTest {
     @Test
     public void testKeySizeFilter() throws Exception {
         Metadata metadata = filter(10, 1000, 10000, 100,
-                null, true);
+                Collections.EMPTY_SET, Collections.EMPTY_SET, true);
         //test that must add keys are not truncated
         metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
         metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
@@ -138,13 +139,13 @@ public class StandardWriteFilterTest extends TikaTest {
         String k = "dc:creator";//20 bytes
         //key is > maxTotalBytes, so the value isn't even added
         Metadata metadata = filter(100, 10000, 10,
-                100, null, false);
+                100, Collections.EMPTY_SET, Collections.EMPTY_SET, false);
         metadata.set(k, "ab");
         assertEquals(1, metadata.names().length);
         assertEquals("true", 
metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
 
         metadata = filter(100, 10000, 50, 100,
-                null, false);
+                Collections.EMPTY_SET, Collections.EMPTY_SET, false);
         for (int i = 0; i < 10; i++) {
             metadata.set(k, "abcde");
         }
@@ -178,7 +179,8 @@ public class StandardWriteFilterTest extends TikaTest {
     @Test
     public void testMinSizeForAlwaysInclude() throws Exception {
         //test that mimes don't get truncated
-        Metadata metadata = filter(100, 10, 10000, 100, null, true);
+        Metadata metadata = filter(100, 10, 10000, 100,
+                Collections.EMPTY_SET, Collections.EMPTY_SET, true);
 
         String mime = getLongestMime().toString();
         metadata.set(Metadata.CONTENT_TYPE, mime);
@@ -192,21 +194,47 @@ public class StandardWriteFilterTest extends TikaTest {
 
     @Test
     public void testMaxFieldValues() throws Exception {
-        Metadata metadata = filter(100, 10000, 10000, 3, null, true);
+        Metadata metadata = filter(100, 10000, 10000, 3,
+                Collections.EMPTY_SET, Collections.EMPTY_SET, true);
         for (int i = 0; i < 10; i++) {
             metadata.add(TikaCoreProperties.SUBJECT, "ab");
         }
         assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
     }
 
+    @Test
+    public void testExclude() throws Exception {
+        TikaConfig tikaConfig =
+                new 
TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        mock += "<metadata action=\"add\" 
name=\"dc:creator\">01234567890123456789</metadata>";
+        mock += "<metadata action=\"add\" 
name=\"subject\">01234567890123456789</metadata>";
+        mock += "<metadata action=\"add\" 
name=\"subjectB\">01234567890123456789</metadata>";
+        mock += "<write element=\"p\" times=\"1\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new 
ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+        assertEquals(9, metadata.names().length);
+        assertEquals("01234567890123456789", metadata.get("dc:creator"));
+        assertEquals("01234567890123456789", metadata.get("subjectB"));
+        assertNull(metadata.get("subject"));
+    }
+
+
     private void assertTruncated(Metadata metadata) {
         assertEquals("true", 
metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
     }
     private Metadata filter(int maxKeySize, int maxFieldSize, int 
maxTotalBytes,
                             int maxValuesPerField,
-                            Set<String> includeFields, boolean includeEmpty) {
+                            Set<String> includeFields, Set<String> 
excludeFields, boolean includeEmpty) {
         MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, 
maxFieldSize,
-                maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
+                maxTotalBytes, maxValuesPerField, includeFields, 
excludeFields, includeEmpty);
         Metadata metadata = new Metadata();
         metadata.setMetadataWriteFilter(filter);
         return metadata;
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml
new file mode 100644
index 000000000..974b43678
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>12345</spoolToDisk>
+      <outputThreshold>6789</outputThreshold>
+    </params>
+    <metadataWriteFilterFactory 
class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
+      <params>
+        <excludeFields>
+          <field>subject</field>
+        </excludeFields>
+      </params>
+    </metadataWriteFilterFactory>
+  </autoDetectParserConfig>
+</properties>

Reply via email to