This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 38e393e7d1b48a1490a67baf854972f30d1ed7b3 Author: tallison <[email protected]> AuthorDate: Wed Jun 3 14:02:38 2020 -0400 TIKA-3104 -- add bplist subtype detector --- .../org/apache/tika/mime/tika-mimetypes.xml | 5 + .../apache/tika/parser/apple/BPListDetector.java | 114 +++++++++++++++++++++ .../apple/{PListParser.java => BPListParser.java} | 33 ++++-- .../services/org.apache.tika.detect.Detector | 1 + .../services/org.apache.tika.parser.Parser | 2 +- .../tika/detect/TestContainerAwareDetector.java | 7 ++ ...{PListParserTest.java => BPListParserTest.java} | 18 +++- .../resources/test-documents/testMemgraph.memgraph | Bin 0 -> 646412 bytes 8 files changed, 169 insertions(+), 11 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index f16ae5a..cf95c0d 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3702,6 +3702,11 @@ <glob pattern="*.iso"/> </mime-type> + <mime-type type="application/x-itunes-bplist"> + <_comment>Apple iTunes Binary Property List</_comment> + <sub-class-of type="application/x-bplist"/> + </mime-type> + <mime-type type="application/x-itunes-ipa"> <sub-class-of type="application/zip"/> <_comment>Apple iOS IPA AppStore file</_comment> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java new file mode 100644 index 0000000..6631fa7 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.apple; + +import com.dd.plist.NSDictionary; +import com.dd.plist.NSObject; +import com.dd.plist.PropertyListFormatException; +import com.dd.plist.PropertyListParser; +import org.apache.poi.util.IOUtils; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.IOExceptionWithCause; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.xml.sax.SAXException; + +import javax.xml.parsers.ParserConfigurationException; +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.Set; + +/** + * Detector that uses com.dd.plist's PList parser to identify + * subtypes of binary property lists + */ +public class BPListDetector implements Detector { + + + MediaType MEMGRAPH = MediaType.application("x-memgraph"); + MediaType WEBARCHIVE = MediaType.application("x-webarchive"); + MediaType BPLIST = MediaType.application("x-bplist"); + MediaType ITUNES = MediaType.application("x-itunes-bplist"); + + /** + * @param input input stream must support reset + * @param metadata input metadata for the document + * @return the detected bplist subtype, or {@link MediaType#OCTET_STREAM} if the stream is null or does not start with the bplist magic + * @throws IOException if the stream cannot be reset or the property list cannot be parsed + */ + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + if (input == null) { + return MediaType.OCTET_STREAM; + } + input.mark(8); + byte[] bytes = new
byte[8]; + + try { + int read = IOUtils.readFully(input, bytes); + if (read < 6) { + return MediaType.OCTET_STREAM; + } + } catch (IOException e) { + return MediaType.OCTET_STREAM; + } finally { + input.reset(); + } + + int i = 0; + if (bytes[i++] != 'b' || bytes[i++] != 'p' + || bytes[i++] != 'l' || bytes[i++] != 'i' + || bytes[i++] != 's' || bytes[i++] != 't') { + return MediaType.OCTET_STREAM; + } + //TODO: extract the version with the next two bytes if they were read + NSObject rootObj = null; + try { + if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) { + rootObj = PropertyListParser.parse(((TikaInputStream) input).getFile()); + } else { + rootObj = PropertyListParser.parse(input); + } + if (input instanceof TikaInputStream) { + ((TikaInputStream) input).setOpenContainer(rootObj); + } + } catch (PropertyListFormatException | ParseException | ParserConfigurationException | SAXException e) { + throw new IOExceptionWithCause("problem parsing root", e); + } + if (rootObj instanceof NSDictionary) { + return detectOnKeys(((NSDictionary) rootObj).getHashMap().keySet()); + } + return BPLIST; + } + + private MediaType detectOnKeys(Set<String> keySet) { + if (keySet.contains("nodes") && keySet.contains("edges") + && keySet.contains("graphEncodingVersion")) { + return MEMGRAPH; + } else if (keySet.contains("WebMainResource") //&& keySet.contains("WebSubresources") should we require this? 
+ ) { + return WEBARCHIVE; + } else if (keySet.contains("Playlists") && keySet.contains("Tracks") + && keySet.contains("Music Folder")) { + return ITUNES; + } //if it contains $archiver and $objects, it is a bplist inside a webarchive + + return BPLIST; + } +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java similarity index 84% rename from tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java rename to tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java index 5d4cc3e..29d0fb9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java @@ -26,6 +26,7 @@ import com.dd.plist.NSSet; import com.dd.plist.NSString; import com.dd.plist.PropertyListFormatException; import com.dd.plist.PropertyListParser; +import com.dd.plist.UID; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -54,7 +55,7 @@ import java.util.Set; * Parser for Apple's plist and bplist. 
This is a wrapper around * com.googlecode.plist:dd-plist */ -public class PListParser extends AbstractParser { +public class BPListParser extends AbstractParser { private static final String ARR = "array"; private static final String DATA = "data"; @@ -65,6 +66,7 @@ public class PListParser extends AbstractParser { private static final String PLIST = "plist"; private static final String SET = "set"; private static final String STRING = "string"; + private static final String UID = "uid"; private static final Set<MediaType> SUPPORTED_TYPES = @@ -82,14 +84,22 @@ public class PListParser extends AbstractParser { EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); NSObject rootObj = null; - try { - if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) { - rootObj = PropertyListParser.parse(((TikaInputStream) stream).getFile()); - } else { - rootObj = PropertyListParser.parse(stream); + //if this already went through the BPListDetector, + //there should be an NSObject in the open container + if (stream instanceof TikaInputStream) { + rootObj = (NSObject) ((TikaInputStream)stream).getOpenContainer(); + } + + if (rootObj == null) { + try { + if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) { + rootObj = PropertyListParser.parse(((TikaInputStream) stream).getFile()); + } else { + rootObj = PropertyListParser.parse(stream); + } + } catch (PropertyListFormatException | ParseException | ParserConfigurationException e) { + throw new TikaException("problem parsing root", e); } - } catch (PropertyListFormatException|ParseException|ParserConfigurationException e) { - throw new TikaException("problem parsing root", e); } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); State state = new State(xhtml, metadata, embeddedDocumentExtractor, df); @@ -133,8 +143,13 @@ public class PListParser extends AbstractParser {
state.xhtml.startElement(SET); parseSet((NSSet)obj, state); state.xhtml.endElement(SET); + } else if (obj instanceof UID) { + //do we want to do anything with obj.getBytes() + state.xhtml.element(UID, ((UID)obj).getName()); } else { - throw new UnsupportedOperationException("don't yet support this type of object: "+obj.getClass()); + throw new UnsupportedOperationException( + "don't yet support this type of object: "+obj.getClass() + + " Please open an issue on our tracker"); } } diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector index 8a3d85f..5e766c6 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector @@ -16,3 +16,4 @@ org.apache.tika.detect.OverrideDetector org.apache.tika.parser.microsoft.POIFSContainerDetector org.apache.tika.parser.pkg.ZipContainerDetector +org.apache.tika.parser.apple.BPListDetector diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 028de26..ceb1399 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -14,7 +14,7 @@ # limitations under the License. 
org.apache.tika.parser.apple.AppleSingleFileParser -org.apache.tika.parser.apple.PListParser +org.apache.tika.parser.apple.BPListParser org.apache.tika.parser.asm.ClassParser org.apache.tika.parser.audio.AudioParser org.apache.tika.parser.audio.MidiParser diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index 2fa274a..2b4c39a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -563,4 +563,11 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest { assertEquals("application/vnd.oasis.opendocument.presentation", metadataList.get(2).get(Metadata.CONTENT_TYPE)); } + + @Test + public void testBPList() throws Exception { + assertTypeByData("testMemgraph.memgraph", "application/x-memgraph"); + assertTypeByData("testWEBARCHIVE.webarchive", "application/x-webarchive"); + assertTypeByData("testBPList.bplist", "application/x-itunes-bplist"); + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java similarity index 68% rename from tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java rename to tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java index 9d78548..9fad311 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java @@ -17,7 +17,9 @@ package org.apache.tika.parser.apple; import org.apache.tika.TikaTest; +import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import 
org.junit.Test; @@ -26,7 +28,7 @@ import java.util.List; import static org.junit.Assert.assertEquals; -public class PListParserTest extends TikaTest { +public class BPListParserTest extends TikaTest { @Test public void testBasicBinaryPList() throws Exception { @@ -35,10 +37,24 @@ public class PListParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist"); assertEquals(21, metadataList.size()); Metadata m = metadataList.get(0); + assertEquals("application/x-itunes-bplist", m.get(Metadata.CONTENT_TYPE)); String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("<key>Application Version</key><string>9.0", content); //TODO -- bad encoding right after this...smart quote? assertContains("<string>90", content); } + + @Test + public void testWebArchive() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testWEBARCHIVE.webarchive"); + assertEquals(12, metadataList.size()); + Metadata m0 = metadataList.get(0); + assertEquals("application/x-webarchive", m0.get(Metadata.CONTENT_TYPE)); + Metadata m1 = metadataList.get(1); + String content = m1.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); + assertContains("December 2008: Apache Tika Release", content); + } + + //TODO -- add unit tests for memgraph } diff --git a/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph b/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph new file mode 100644 index 0000000..cb7df3e Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph differ
