This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 634f9191f TIKA-3812 -- fix unit test to confirm plain png and jpeg
work with config file
634f9191f is described below
commit 634f9191f1a1f3cd21a5ff4311af249663567716
Author: tallison <[email protected]>
AuthorDate: Tue Oct 4 12:27:13 2022 -0400
TIKA-3812 -- fix unit test to confirm plain png and jpeg work with config
file
---
.../pom.xml | 6 +++
.../java/org/apache/tika/parser/ocr/TestOCR.java | 48 +++++++++++++++++++++-
.../config/tika-config-restricted-gdal.xml | 32 +++++++++++++++
3 files changed, 84 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/pom.xml b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/pom.xml
index ede345235..dfc679ec7 100644
--- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/pom.xml
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/pom.xml
@@ -35,6 +35,12 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-scientific-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-sqlite3-package</artifactId>
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
index e466a34ca..f11ede9bf 100644
--- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
@@ -16,16 +16,29 @@
*/
package org.apache.tika.parser.ocr;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
+import java.io.IOException;
+import java.io.InputStream;
import java.util.List;
+import java.util.Map;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.gdal.GDALParser;
public class TestOCR extends TikaTest {
@@ -37,13 +50,44 @@ public class TestOCR extends TikaTest {
@Test
public void testJPEG() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testOCR.jpg");
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.jpg", loadParser());
assertContains("OCR Testing",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
@Test
public void testPNG() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testOCR.png");
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.png", loadParser());
assertContains("file contains",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ @Test
+ public void testOthers() throws Exception {
+ Parser p = loadParser();
+ if (p instanceof CompositeParser) {
+ Map<MediaType, Parser> parsers = ((CompositeParser)p).getParsers();
+ Class clz = getParser(MediaType.application("x-netcdf"), parsers);
+ assertEquals(GDALParser.class, clz);
+ }
+ }
+
+ private Class getParser(MediaType mediaType, Map<MediaType, Parser> parsers) {
+ //this is fragile, but works well enough for a unit test
+ Parser p = parsers.get(mediaType);
+ if (p instanceof CompositeParser) {
+ return getParser(mediaType, ((CompositeParser)p).getParsers());
+ } else if (p instanceof ParserDecorator) {
+ Parser decorated = ((ParserDecorator)p).getWrappedParser();
+ return decorated.getClass();
+ }
+ return p.getClass();
+ }
+
+ private Parser loadParser() throws IOException, TikaException, SAXException {
+ try (InputStream is = TestOCR.class.getResourceAsStream(
+ "/config/tika-config-restricted-gdal.xml")) {
+ TikaConfig tikaConfig = new TikaConfig(is);
+ return new AutoDetectParser(tikaConfig);
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/config/tika-config-restricted-gdal.xml b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/config/tika-config-restricted-gdal.xml
new file mode 100644
index 000000000..5e6fe2461
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/config/tika-config-restricted-gdal.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <!-- don't load this here, load it later -->
+ <parser-exclude class="org.apache.tika.parser.gdal.GDALParser"/>
+ </parser>
+ <!-- this prevents the GDALParser from parsing these file formats -->
+ <parser class="org.apache.tika.parser.gdal.GDALParser">
+ <mime-exclude>image/jpeg</mime-exclude>
+ <mime-exclude>image/png</mime-exclude>
+ <mime-exclude>image/jp2</mime-exclude>
+ <mime-exclude>image/gif</mime-exclude>
+ </parser>
+ </parsers>
+</properties>