This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6d066cbcd4589831d4f74f977479d4d902f0e5df Author: Christian <[email protected]> AuthorDate: Tue Jun 16 18:50:46 2020 +0200 add heif mimetype support (#278) fixes TIKA-2830 --- .../org/apache/tika/mime/tika-mimetypes.xml | 32 ++++++++++ .../org/apache/tika/parser/image/HeifParser.java | 68 +++++++++++++++++++++ .../tika/parser/image/ImageMetadataExtractor.java | 14 +++++ .../services/org.apache.tika.parser.Parser | 1 + .../apache/tika/parser/image/HeifParserTest.java | 58 ++++++++++++++++++ .../test/resources/test-documents/IMG_1034.heic | Bin 0 -> 1499892 bytes 6 files changed, 173 insertions(+) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index a7fe00e..64d7dfe 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5582,6 +5582,38 @@ <glob pattern="*.webp"/> </mime-type> + <mime-type type="image/heic"> + <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link> + <magic priority="50"> + <match value="ftypheic" type="string" offset="4"/> + <match value="ftypheix" type="string" offset="4"/> + </magic> + <glob pattern="*.heic"/> + </mime-type> + + <mime-type type="image/heif"> + <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link> + <magic priority="50"> + <match value="ftypmif1" type="string" offset="4"/> + </magic> + <glob pattern="*.heif"/> + </mime-type> + + <mime-type type="image/heif-sequence"> + <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link> + <magic priority="50"> + <match value="ftypmsf1" type="string" offset="4"/> + </magic> + </mime-type> + + <mime-type type="image/heic-sequence"> + <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link> + <magic priority="50"> + <match value="ftyphevc" type="string" offset="4"/> + <match value="ftyphevx" type="string" offset="4"/> + </magic> + </mime-type> + <mime-type type="image/wmf"> <alias type="image/x-wmf"/> <alias type="application/x-msmetafile"/> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java new file mode 100644 index 0000000..9880d3c --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.image; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + + +public class HeifParser extends AbstractParser { + private static final Set<MediaType> SUPPORTED_TYPES = + new HashSet<>( + Arrays.asList( + MediaType.image("heif"), + MediaType.image("heif-sequence"), + MediaType.image("heic"), + MediaType.image("heic-sequence") + ) + ); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(stream, tmp); + new ImageMetadataExtractor(metadata).parseHeif(tis.getFile()); + } finally { + tmp.dispose(); + } + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.endDocument(); + + } +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java index 912c0f1..622f48a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java @@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.FileInputStream; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; import java.text.SimpleDateFormat; @@ -29,6 +30,7 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.drew.imaging.heif.HeifMetadataReader; import com.drew.imaging.jpeg.JpegMetadataReader; import com.drew.imaging.jpeg.JpegProcessingException; import com.drew.imaging.riff.RiffProcessingException; @@ -150,6 +152,18 @@ public class ImageMetadataExtractor { } } + public void parseHeif(File file) throws IOException, TikaException { + try { + com.drew.metadata.Metadata heifMetadata = new com.drew.metadata.Metadata(); + heifMetadata = HeifMetadataReader.readMetadata(new FileInputStream(file)); + handle(heifMetadata); + } catch (IOException e) { + throw e; + } catch (MetadataException e) { + throw new TikaException("Can't process Heif data", e); + } + } + public void parseRawExif(InputStream stream, int length, boolean needsExifHeader) throws IOException, SAXException, TikaException { byte[] exif; diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index ceb1399..f0bdb01 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -34,6 +34,7 @@ org.apache.tika.parser.image.ImageParser org.apache.tika.parser.image.PSDParser org.apache.tika.parser.image.TiffParser org.apache.tika.parser.image.WebPParser +org.apache.tika.parser.image.HeifParser org.apache.tika.parser.iptc.IptcAnpaParser org.apache.tika.parser.iwork.IWorkPackageParser org.apache.tika.parser.jpeg.JpegParser diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java new file mode 100644 index 0000000..681b616 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.image; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; + + +public class HeifParserTest { + + Parser parser = new AutoDetectParser(); + + /* + Example photo in test-documents (IMG_1034.heic) + are in the public domain. These files were retrieved from: + https://github.com/drewnoakes/metadata-extractor-images/tree/master/heic + */ + @Test + public void testSimple() throws Exception { + Metadata metadata = new Metadata(); + InputStream stream = + getClass().getResourceAsStream("/test-documents/IMG_1034.heic"); + + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + assertEquals("heic", metadata.get("Major Brand")); + assertEquals("512 pixels", metadata.get("Width")); + assertEquals("512 pixels", metadata.get("Height")); + assertEquals("image/heic", metadata.get(Metadata.CONTENT_TYPE)); + + IOUtils.closeQuietly(stream); + } + +} diff --git a/tika-parsers/src/test/resources/test-documents/IMG_1034.heic b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic new file mode 100644 index 0000000..9c63182 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic differ
