This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 14341da05 TIKA-3796 -- fix bug that prevented configuration of
includeHeadersAndFooters in AbstractOfficeParser via tika-config.
14341da05 is described below
commit 14341da0510e5019b4c0c2f04542bee16372c63e
Author: tallison <[email protected]>
AuthorDate: Mon Jun 20 16:13:15 2022 -0400
TIKA-3796 -- fix bug that prevented configuration of
includeHeadersAndFooters in AbstractOfficeParser via tika-config.
---
.../parser/microsoft/AbstractOfficeParser.java | 5 ++
.../tika/parser/microsoft/ExcelParserTest.java | 54 +++++++++++++---------
.../tika/parser/microsoft/OfficeParserTest.java | 1 -
.../tika/parser/microsoft/WordParserTest.java | 9 ++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 29 ++++++++++++
.../microsoft/tika-config-headers-footers.xml | 32 +++++++++++++
6 files changed, 107 insertions(+), 23 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 29347d79c..461346f0f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -149,4 +149,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
public void setDateFormatOverride(String format) {
defaultOfficeParserConfig.setDateOverrideFormat(format);
}
+
+ @Field
+ public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
+ defaultOfficeParserConfig.setIncludeHeadersAndFooters(includeHeadersAndFooters);
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 53cc1acea..600194407 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -414,31 +414,41 @@ public class ExcelParserTest extends TikaTest {
@Test
public void testHeaderAndFooterNotExtraction() throws Exception {
- try (InputStream input = getResourceAsStream(
- "/test-documents/testEXCEL_headers_footers.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.UK);
-
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setIncludeHeadersAndFooters(false);
- context.set(OfficeParserConfig.class, officeParserConfig);
- new OfficeParser().parse(input, handler, metadata, context);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
- assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ context.set(OfficeParserConfig.class, officeParserConfig);
- String content = handler.toString();
- assertContains("John Smith1", content);
- assertContains("John Smith50", content);
- assertContains("1 Corporate HQ", content);
- assertNotContained("Header - Corporate Spreadsheet", content);
- assertNotContained("Header - For Internal Use Only", content);
- assertNotContained("Header - Author: John Smith", content);
- assertNotContained("Footer - Corporate Spreadsheet", content);
- assertNotContained("Footer - For Internal Use Only", content);
- assertNotContained("Footer - Author: John Smith", content);
+ XMLResult xmlResult = getXML("testEXCEL_headers_footers.xls", context);
+ assertEquals("application/vnd.ms-excel", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ String content = xmlResult.xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertNotContained("Header - Corporate Spreadsheet", content);
+ assertNotContained("Header - For Internal Use Only", content);
+ assertNotContained("Header - Author: John Smith", content);
+ assertNotContained("Footer - Corporate Spreadsheet", content);
+ assertNotContained("Footer - For Internal Use Only", content);
+ assertNotContained("Footer - Author: John Smith", content);
+
+ //now test configuration via tika-config
+ Parser configuredParser = null;
+ try (InputStream is = getResourceAsStream("tika-config-headers-footers.xml")) {
+ configuredParser = new AutoDetectParser(new TikaConfig(is));
}
+ content = getXML("testEXCEL_headers_footers.xls", configuredParser).xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertNotContained("Header - Corporate Spreadsheet", content);
+ assertNotContained("Header - For Internal Use Only", content);
+ assertNotContained("Header - Author: John Smith", content);
+ assertNotContained("Footer - Corporate Spreadsheet", content);
+ assertNotContained("Footer - For Internal Use Only", content);
+ assertNotContained("Footer - Author: John Smith", content);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 71b42ffa0..129fed630 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -27,7 +27,6 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-
public class OfficeParserTest extends TikaTest {
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 6f6a2e7e5..51d574e82 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
public class WordParserTest extends TikaTest {
@@ -297,6 +298,14 @@ public class WordParserTest extends TikaTest {
String xml = getXML("testWORD_various.doc", parseContext).xml;
assertNotContained("This is the header text.", xml);
assertNotContained("This is the footer text.", xml);
+
+ Parser configuredParser = null;
+ try (InputStream is = getResourceAsStream("tika-config-headers-footers.xml")) {
+ configuredParser = new AutoDetectParser(new TikaConfig(is));
+ }
+ xml = getXML("testWORD_various.doc", configuredParser).xml;
+ assertNotContained("This is the header text.", xml);
+ assertNotContained("This is the footer text.", xml);
}
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 738d9a051..214953aa6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.OfficeParserTest;
import org.apache.tika.sax.BodyContentHandler;
public class OOXMLParserTest extends MultiThreadedTikaTest {
@@ -613,6 +614,18 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
String xml = getXML("testWORD_various.docx", parseContext).xml;
assertNotContained("This is the header text.", xml);
assertNotContained("This is the footer text.", xml);
+
+ //now test configuration via tika-config
+ Parser configuredParser = null;
+ try (InputStream is =
+ OfficeParserTest.class.getResourceAsStream(
+ "tika-config-headers-footers.xml")) {
+ configuredParser = new AutoDetectParser(new TikaConfig(is));
+ }
+ xml = getXML("testWORD_various.docx", configuredParser).xml;
+ assertNotContained("This is the header text.", xml);
+ assertNotContained("This is the footer text.", xml);
+
}
@Test
@@ -1201,6 +1214,22 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertNotContained("Footer - Corporate Spreadsheet", content);
assertNotContained("Footer - For Internal Use Only", content);
assertNotContained("Footer - Author: John Smith", content);
+
+ //now test configuration via tika-config
+ Parser configuredParser = null;
+ try (InputStream is = OfficeParserTest.class.getResourceAsStream("tika-config-headers-footers.xml")) {
+ configuredParser = new AutoDetectParser(new TikaConfig(is));
+ }
+ content = getXML("testEXCEL_headers_footers.xlsx", configuredParser).xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertNotContained("Header - Corporate Spreadsheet", content);
+ assertNotContained("Header - For Internal Use Only", content);
+ assertNotContained("Header - Author: John Smith", content);
+ assertNotContained("Footer - Corporate Spreadsheet", content);
+ assertNotContained("Footer - For Internal Use Only", content);
+ assertNotContained("Footer - Author: John Smith", content);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml
new file mode 100644
index 000000000..9e74e7c40
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-headers-footers.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="includeHeadersAndFooters" type="bool">false</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="includeHeadersAndFooters" type="bool">false</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>