This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 14341da05 TIKA-3796 -- fix bug that prevented configuration of includeHeadersAndFooters in AbstractOfficeParser via tika-config.
14341da05 is described below

commit 14341da0510e5019b4c0c2f04542bee16372c63e
Author: tallison <[email protected]>
AuthorDate: Mon Jun 20 16:13:15 2022 -0400

    TIKA-3796 -- fix bug that prevented configuration of includeHeadersAndFooters in AbstractOfficeParser via tika-config.
---
 .../parser/microsoft/AbstractOfficeParser.java     |  5 ++
 .../tika/parser/microsoft/ExcelParserTest.java     | 54 +++++++++++++---------
 .../tika/parser/microsoft/OfficeParserTest.java    |  1 -
 .../tika/parser/microsoft/WordParserTest.java      |  9 ++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 29 ++++++++++++
 .../microsoft/tika-config-headers-footers.xml      | 32 +++++++++++++
 6 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 29347d79c..461346f0f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -149,4 +149,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
     public void setDateFormatOverride(String format) {
         defaultOfficeParserConfig.setDateOverrideFormat(format);
     }
+
+    @Field
+    public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
+        defaultOfficeParserConfig.setIncludeHeadersAndFooters(includeHeadersAndFooters);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 53cc1acea..600194407 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -414,31 +414,41 @@ public class ExcelParserTest extends TikaTest {
 
     @Test
     public void testHeaderAndFooterNotExtraction() throws Exception {
-        try (InputStream input = getResourceAsStream(
-                "/test-documents/testEXCEL_headers_footers.xls")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.UK);
-
-            OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-            officeParserConfig.setIncludeHeadersAndFooters(false);
-            context.set(OfficeParserConfig.class, officeParserConfig);
-            new OfficeParser().parse(input, handler, metadata, context);
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.UK);
 
-            assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeHeadersAndFooters(false);
+        context.set(OfficeParserConfig.class, officeParserConfig);
 
-            String content = handler.toString();
-            assertContains("John Smith1", content);
-            assertContains("John Smith50", content);
-            assertContains("1 Corporate HQ", content);
-            assertNotContained("Header - Corporate Spreadsheet", content);
-            assertNotContained("Header - For Internal Use Only", content);
-            assertNotContained("Header - Author: John Smith", content);
-            assertNotContained("Footer - Corporate Spreadsheet", content);
-            assertNotContained("Footer - For Internal Use Only", content);
-            assertNotContained("Footer - Author: John Smith", content);
+        XMLResult xmlResult = getXML("testEXCEL_headers_footers.xls", context);
+        assertEquals("application/vnd.ms-excel", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        String content = xmlResult.xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertNotContained("Header - Corporate Spreadsheet", content);
+        assertNotContained("Header - For Internal Use Only", content);
+        assertNotContained("Header - Author: John Smith", content);
+        assertNotContained("Footer - Corporate Spreadsheet", content);
+        assertNotContained("Footer - For Internal Use Only", content);
+        assertNotContained("Footer - Author: John Smith", content);
+
+        //now test configuration via tika-config
+        Parser configuredParser = null;
+        try (InputStream is = getResourceAsStream("tika-config-headers-footers.xml")) {
+            configuredParser = new AutoDetectParser(new TikaConfig(is));
         }
+        content = getXML("testEXCEL_headers_footers.xls", configuredParser).xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertNotContained("Header - Corporate Spreadsheet", content);
+        assertNotContained("Header - For Internal Use Only", content);
+        assertNotContained("Header - Author: John Smith", content);
+        assertNotContained("Footer - Corporate Spreadsheet", content);
+        assertNotContained("Footer - For Internal Use Only", content);
+        assertNotContained("Footer - Author: John Smith", content);
     }
 
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 71b42ffa0..129fed630 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -27,7 +27,6 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
 
-
 public class OfficeParserTest extends TikaTest {
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 6f6a2e7e5..51d574e82 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 
 public class WordParserTest extends TikaTest {
@@ -297,6 +298,14 @@ public class WordParserTest extends TikaTest {
         String xml = getXML("testWORD_various.doc", parseContext).xml;
         assertNotContained("This is the header text.", xml);
         assertNotContained("This is the footer text.", xml);
+
+        Parser configuredParser = null;
+        try (InputStream is = getResourceAsStream("tika-config-headers-footers.xml")) {
+            configuredParser = new AutoDetectParser(new TikaConfig(is));
+        }
+        xml = getXML("testWORD_various.doc", configuredParser).xml;
+        assertNotContained("This is the header text.", xml);
+        assertNotContained("This is the footer text.", xml);
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 738d9a051..214953aa6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.OfficeParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 
 public class OOXMLParserTest extends MultiThreadedTikaTest {
@@ -613,6 +614,18 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
         String xml = getXML("testWORD_various.docx", parseContext).xml;
         assertNotContained("This is the header text.", xml);
         assertNotContained("This is the footer text.", xml);
+
+        //now test configuration via tika-config
+        Parser configuredParser = null;
+        try (InputStream is =
+                     OfficeParserTest.class.getResourceAsStream(
+                             "tika-config-headers-footers.xml")) {
+            configuredParser = new AutoDetectParser(new TikaConfig(is));
+        }
+        xml = getXML("testWORD_various.docx", configuredParser).xml;
+        assertNotContained("This is the header text.", xml);
+        assertNotContained("This is the footer text.", xml);
+
     }
 
     @Test
@@ -1201,6 +1214,22 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
         assertNotContained("Footer - Corporate Spreadsheet", content);
         assertNotContained("Footer - For Internal Use Only", content);
         assertNotContained("Footer - Author: John Smith", content);
+
+        //now test configuration via tika-config
+        Parser configuredParser = null;
+        try (InputStream is = OfficeParserTest.class.getResourceAsStream("tika-config-headers-footers.xml")) {
+            configuredParser = new AutoDetectParser(new TikaConfig(is));
+        }
+        content = getXML("testEXCEL_headers_footers.xlsx", configuredParser).xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertNotContained("Header - Corporate Spreadsheet", content);
+        assertNotContained("Header - For Internal Use Only", content);
+        assertNotContained("Header - Author: John Smith", content);
+        assertNotContained("Footer - Corporate Spreadsheet", content);
+        assertNotContained("Footer - For Internal Use Only", content);
+        assertNotContained("Footer - Author: John Smith", content);
     }
 
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml
new file mode 100644
index 000000000..9e74e7c40
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="includeHeadersAndFooters" type="bool">false</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="includeHeadersAndFooters" type="bool">false</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>

Reply via email to