This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 04225d2834104c973e6cff421c283af876b2e398 Author: tballison <[email protected]> AuthorDate: Thu Mar 29 13:49:59 2018 -0400 TIKA-2621 -- add support for brotli --- .../org/apache/tika/mime/tika-mimetypes.xml | 14 ++++ .../src/test/java/org/apache/tika/TikaTest.java | 14 ++++ tika-parsers/pom.xml | 7 ++ .../apache/tika/parser/pkg/CompressorParser.java | 82 +++++++++++++++++++-- .../tika/parser/pkg/CompressorParserTest.java | 22 +++++- .../test-documents/testBROTLI_compressed.br | Bin 0 -> 12 bytes 6 files changed, 130 insertions(+), 9 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 346eb73..634d9d1 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3135,6 +3135,20 @@ <match value="bplist" type="string" offset="0"/> </magic> </mime-type> + <mime-type type="application/x-gtar"> + <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment> + <magic priority="50"> + <!-- GNU tar archive --> + <match value="ustar \0" type="string" offset="257" /> + </magic> + <glob pattern="*.gtar"/> + <sub-class-of type="application/x-tar"/> + </mime-type> + + <mime-type type="application/x-brotli"> + <glob pattern="*.br" /> + <glob pattern="*.brotli" /> + </mime-type> <mime-type type="application/x-bzip"> <magic priority="40"> diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 153a564..9c827f7 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -213,6 +213,20 @@ public abstract class TikaTest { return getRecursiveMetadata(filePath, new ParseContext()); } + protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception { + return getRecursiveMetadata(filePath, new ParseContext(), metadata); + } + + protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception { + Parser p = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { + wrapper.parse(is, new DefaultHandler(), metadata, context); + } + return wrapper.getMetadata(); + } + protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception { Parser p = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index a3e9e4d..e6c7720 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -40,6 +40,8 @@ <codec.version>1.10</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> <tukaani.version>1.8</tukaani.version> + <!-- NOTE: sync brotli version with commons-compress in tika-parent--> + <brotli.version>0.1.2</brotli.version> <mime4j.version>0.8.1</mime4j.version> <vorbis.version>0.8</vorbis.version> <pdfbox.version>2.0.9</pdfbox.version> @@ -151,6 +153,11 @@ <version>${tukaani.version}</version> </dependency> <dependency> + <groupId>org.brotli</groupId> + <artifactId>dec</artifactId> + <version>${brotli.version}</version> + </dependency> + <dependency> <groupId>com.github.luben</groupId> <artifactId>zstd-jni</artifactId> <version>1.3.3-3</version> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index ada7ec9..658d04c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -21,6 +21,10 @@ import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; import java.util.Set; import org.apache.commons.compress.MemoryLimitException; @@ -78,9 +82,47 @@ public class CompressorParser extends AbstractParser { private static final MediaType ZSTD = MediaType.application("zstd"); private static final MediaType DEFLATE64= MediaType.application("deflate64"); - private static final Set<MediaType> SUPPORTED_TYPES = - MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, - XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD); + private static Set<MediaType> SUPPORTED_TYPES; + private static Map<String, String> MIMES_TO_NAME; + + static { + Set<MediaType> TMP_SET = new HashSet<>(); + TMP_SET.addAll( + MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, + XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA)); + try { + Class.forName("org.brotli.dec.BrotliInputStream"); + TMP_SET.add(BROTLI); + } catch (NoClassDefFoundError|ClassNotFoundException e) { + //swallow + } + try { + Class.forName("com.github.luben.zstd.ZstdInputStream"); + TMP_SET.add(ZSTD); + } catch (NoClassDefFoundError|ClassNotFoundException e) { + //swallow + } + SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET); + } + + static { + //map the mime type strings to the compressor stream names + Map<String, String> tmpMimesToName = new HashMap<>(); + tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2); + tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP); + tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED); + tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK); + tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ); + tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200); + tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED); + tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE); + tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z); + tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA); + tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI); + tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD); + MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName); + } + private int memoryLimitInKb = 100000;//100MB @@ -181,7 +223,19 @@ public class CompressorParser extends AbstractParser { }); CompressorStreamFactory factory = new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb); - cis = factory.createCompressorInputStream(stream); + //if we've already identified it via autodetect + //trust that and go with the appropriate name + //to avoid calling CompressorStreamFactory.detect() twice + String name = getStreamName(metadata); + if (name != null) { + cis = factory.createCompressorInputStream(name, stream); + } else { + cis = factory.createCompressorInputStream(stream); + MediaType type = getMediaType(cis); + if (!type.equals(MediaType.OCTET_STREAM)) { + metadata.set(CONTENT_TYPE, type.toString()); + } + } } catch (CompressorException e) { if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) { throw new TikaMemoryLimitException(e.getMessage()); @@ -189,10 +243,6 @@ public class CompressorParser extends AbstractParser { throw new TikaException("Unable to uncompress document stream", e); } - MediaType type = getMediaType(cis); - if (!type.equals(MediaType.OCTET_STREAM)) { - metadata.set(CONTENT_TYPE, type.toString()); - } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); @@ -215,6 +265,8 @@ public class CompressorParser extends AbstractParser { name = name.substring(0, name.length() - 5); } else if (name.endsWith(".pack")) { name = name.substring(0, name.length() - 5); + } else if (name.endsWith(".br")) { + name = name.substring(0, name.length() - 3); } else if (name.length() > 0) { name = GzipUtils.getUncompressedFilename(name); } @@ -234,6 +286,20 @@ public class CompressorParser extends AbstractParser { xhtml.endDocument(); } + /** + * @param metadata + * @return CompressorStream name based on the content-type value + * in metadata or <code>null</code> if not found + * ind + */ + private String getStreamName(Metadata metadata) { + String mimeString = metadata.get(Metadata.CONTENT_TYPE); + if (mimeString == null) { + return null; + } + return MIMES_TO_NAME.get(mimeString); + } + @Field public void setMemoryLimitInKb(int memoryLimitInKb) { this.memoryLimitInKb = memoryLimitInKb; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java index 26552eb..9a1d579 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java @@ -21,14 +21,25 @@ package org.apache.tika.parser.pkg; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import java.io.BufferedWriter; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.util.HashSet; +import java.util.List; import java.util.Set; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.RecursiveParserWrapper; import org.junit.BeforeClass; import org.junit.Test; @@ -39,7 +50,6 @@ public class CompressorParserTest extends TikaTest { @BeforeClass public static void setUp() { - NOT_COVERED.add(MediaType.application("x-brotli")); NOT_COVERED.add(MediaType.application("x-lz4-block")); NOT_COVERED.add(MediaType.application("x-snappy-raw")); NOT_COVERED.add(MediaType.application("deflate64")); @@ -68,6 +78,16 @@ public class CompressorParserTest extends TikaTest { } @Test + public void testBrotli() throws Exception { + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br"); + List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata); + + assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + } + + @Test public void testCoverage() throws Exception { //test that the package parser covers all inputstreams handled //by CompressorStreamFactory. When we update commons-compress, and they add diff --git a/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br new file mode 100644 index 0000000..3769516 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br differ -- To stop receiving notification emails like this one, please contact [email protected].
