This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4011 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 627b94175624693dc84296d84a2bd24d499ddc5d Author: tallison <[email protected]> AuthorDate: Mon Apr 10 15:58:49 2023 -0400 TIKA-4011 -- add detection for onix message files --- CHANGES.txt | 2 ++ .../org/apache/tika/mime/tika-mimetypes.xml | 18 ++++++++++++++++ .../java/org/apache/tika/mime/TestMimeTypes.java | 6 ++++++ .../resources/test-documents/testONIXMessage.xml | 21 +++++++++++++++++++ .../test-documents/testONIXMessageShort.xml | 24 ++++++++++++++++++++++ 5 files changed, 71 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index eeab24304..bcdea6c7a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,8 @@ Release 2.7.1 - ??? * Add magic detection for canon raw file types: crw, cr2 and cr3 (TIKA-3991). + * Add detection for ONIX message files (TIKA-4011). + * Add detection and a parser for ActiveMime files (TIKA-3987). * Users may now avoid the ZeroByteFileException via a diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index fdb855f1c..088047914 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -6700,6 +6700,24 @@ <sub-class-of type="application/xml"/> </mime-type> + <mime-type type="application/onix-message+xml"> + <_comment>ONline Information eXchange (ONIX) for books</_comment> + <!-- see https://www.loc.gov/preservation/digital/formats/fdd/fdd000488.shtml --> + <root-XML localName="ONIXMessage" namespaceURI="http://ns.editeur.org/onix/3.0/reference"/> + <!-- do we want a separate mime for this? --> + <root-XML localName="ONIXmessage" namespaceURI="http://ns.editeur.org/onix/3.0/short"/> + <root-XML localName="ONIXMessage"/> + <sub-class-of type="application/xml"/> + </mime-type> + + <mime-type type="application/onix-message-short+xml"> + <_comment>ONline Information eXchange (ONIX) for books</_comment> + <!-- see https://www.loc.gov/preservation/digital/formats/fdd/fdd000488.shtml --> + <root-XML localName="ONIXMessage" namespaceURI="http://ns.editeur.org/onix/3.0/reference"/> + <root-XML localName="ONIXMessage"/> + <sub-class-of type="application/xml"/> + </mime-type> + <mime-type type="text/x-actionscript"> <_comment>ActionScript source code</_comment> <glob pattern="*.as"/> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 8a33c4fe6..784b860a3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1312,6 +1312,12 @@ public class TestMimeTypes { assertTypeDetection("testPGPEncrypted.gpg", "application/pgp-encrypted"); } + @Test + public void testONIX() throws Exception { + assertTypeByData("application/onix-message+xml", "testONIXMessage.xml"); + assertTypeByData("application/onix-message+xml", "testONIXMessageShort.xml"); + } + private void assertText(byte[] prefix) throws IOException { assertMagic("text/plain", prefix); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessage.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessage.xml new file mode 100644 index 000000000..e2e098f7d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessage.xml @@ -0,0 +1,21 @@ +<?xml version="1.0" encoding="UTF-8"?> +<ONIXMessage release="3.0" xmlns="http://ns.editeur.org/onix/3.0/reference"> + <Header> + <Sender> + <SenderName>Global Bookinfo</SenderName> + <ContactName>Someone Or Other, blah</ContactName> + <EmailAddress>[email protected]</EmailAddress> + </Sender> + <Addressee> + <AddresseeName>BooksBooksBooks.com</AddresseeName> + </Addressee> + <MessageNumber>231</MessageNumber> + <SentDateTime>20100510T1115-0400</SentDateTime> + <MessageNote>Sample message</MessageNote> + </Header> + <!-- product record 1 of 1 in message --> + <Product> + <RecordReference>com.globalbookinfo.onix.01734529</RecordReference> + <NotificationType>03</NotificationType> + </Product> +</ONIXMessage> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessageShort.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessageShort.xml new file mode 100644 index 000000000..01a5cd8f8 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testONIXMessageShort.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<ONIXmessage release="3.0" xmlns="http://ns.editeur.org/onix/3.0/short"> + <header> + <sender> + <x298>Global Bookinfo</x298> + <x299>Jane King, +1 555 321 7654</x299> + <j272>[email protected]</j272> + </sender> + <addressee> + <x300>BooksBooksBooks.com</x300> + </addressee> + <m180>231</m180> + <x307>20100510T1115-0400</x307> + <m183>Sample message</m183> + </header> + <!-- product record 1 of 1 in message --> + <product> + <a001>com.globalbookinfo.onix.01734529</a001> + <a002>03</a002> + <a194>04</a194> + <recordsourceidentifier> + </recordsourceidentifier> + </product> +</ONIXmessage> \ No newline at end of file
