This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 33c21a3a4 TIKA-3812 add example of how to configure gdal programmatically
33c21a3a4 is described below
commit 33c21a3a4c9b4805908600083786eb11c127fd94
Author: tallison <[email protected]>
AuthorDate: Wed Oct 5 06:52:27 2022 -0400
TIKA-3812 add example of how to configure gdal programmatically
---
.../java/org/apache/tika/parser/ocr/TestOCR.java | 31 ++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
index f11ede9bf..5655bd72d 100644
--- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java
@@ -21,8 +21,11 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@@ -36,6 +39,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.gdal.GDALParser;
@@ -60,6 +64,33 @@ public class TestOCR extends TikaTest {
assertContains("file contains", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testPNGProgrammatically() throws Exception {
+ //remove the GDAL parser from the default parser
+ Parser defaultParser = new DefaultParser();
+ List<Parser> parsers = new ArrayList<>();
+ for (Parser p : ((CompositeParser)defaultParser).getAllComponentParsers()) {
+ if (! (p instanceof GDALParser)) {
+ parsers.add(p);
+ }
+ }
+
+ //decorate the gdal parser to exclude these image formats
+ Set<MediaType> exclude = new HashSet<>();
+ exclude.add(MediaType.image("png"));
+ exclude.add(MediaType.image("jpeg"));
+ exclude.add(MediaType.image("bmp"));
+ exclude.add(MediaType.image("gif"));
+
+ Parser specialGDAL = ParserDecorator.withoutTypes(new GDALParser(), exclude);
+ parsers.add(specialGDAL);
+
+ Parser autoDetect = new AutoDetectParser(parsers.toArray(new Parser[0]));
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.png", autoDetect);
+ assertContains("file contains", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+ }
+
@Test
public void testOthers() throws Exception {
Parser p = loadParser();