TIKA-1946 -- Initial commit of the QuattroPro and WordPerfect parsers. Many thanks to Pascal Essiembre for contributing these!
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d011d708 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d011d708 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d011d708 Branch: refs/heads/master Commit: d011d708c21669759af86e855b61d98dae19492e Parents: 7a5b983 Author: tballison <[email protected]> Authored: Wed Dec 21 12:10:08 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Dec 21 12:10:08 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../org/apache/tika/metadata/QuattroPro.java | 48 +++++++ .../org/apache/tika/metadata/WordPerfect.java | 66 +++++++++ .../org/apache/tika/mime/tika-mimetypes.xml | 13 +- .../parser/wordperfect/QPWTextExtractor.java | 15 ++- .../tika/parser/wordperfect/QuattroPro.java | 72 ---------- .../tika/parser/wordperfect/WP6Constants.java | 2 +- .../tika/parser/wordperfect/WP6FileHeader.java | 25 ++-- .../parser/wordperfect/WP6TextExtractor.java | 21 +-- .../tika/parser/wordperfect/WPInputStream.java | 37 ++++- .../tika/parser/wordperfect/WordPerfect.java | 69 ---------- .../tika/parser/wordperfect/QuattroProTest.java | 40 +++--- .../parser/wordperfect/WPInputStreamTest.java | 134 +++++++++++++++++++ .../parser/wordperfect/WordPerfectTest.java | 25 +--- 14 files changed, 342 insertions(+), 228 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index bd6a45e..a9cf6f1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.15 - ?? + * Add parsers for WordPerfect and QuattroPro (.qpw) files. + Contributed by Pascal Essiembre (TIKA-1946). + * Add configurability of "preserve-interword-spacing" to TesseractOCRParser (TIKA-2190). 
http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java new file mode 100644 index 0000000..a106e08 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java @@ -0,0 +1,48 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * QuattroPro properties collection. + * @author Pascal Essiembre + */ +public interface QuattroPro { + public static final String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect"; + + /** + * ID. + */ + Property ID = Property.internalText( + QUATTROPRO_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id"); + /** + * Version. + */ + Property VERSION = Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version"); + /** + * Build. + */ + Property BUILD = Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build"); + /** + * Lowest version. 
+ */ + Property LOWEST_VERSION = Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion"); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java new file mode 100644 index 0000000..12ca174 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java @@ -0,0 +1,66 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * WordPerfect properties collection. + * @author Pascal Essiembre + */ +public interface WordPerfect { + public static final String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect"; + + /** + * File size as defined in document header. + */ + Property FILE_SIZE = Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileSize"); + /** + * File identifier. + */ + Property FILE_ID = Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileId"); + /** + * Product type. 
+ */ + Property PRODUCT_TYPE = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "ProductType"); + /** + * File type. + */ + Property FILE_TYPE = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileType"); + /** + * Major version. + */ + Property MAJOR_VERSION = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "MajorVersion"); + /** + * Minor version. + */ + Property MINOR_VERSION = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorVersion"); + /** + * Is encrypted?. + */ + Property ENCRYPTED = Property.internalBoolean( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "Encrypted"); +} http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 7afdb4d..460bcde 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -2594,10 +2594,10 @@ <tika:link>http://en.wikipedia.org/wiki/WordPerfect</tika:link> <tika:uti>com.corel.wordperfect.doc</tika:uti> <magic priority="60"> - <match value="0xFF575043" type="big32" offset="0"/> + <match value="0xFF575043" type="big32" offset="0"/> <!-- ÿWPC --> </magic> <magic priority="50"> - <match value="application/vnd.wordperfect;" type="string" offset="0"></match> + <match value="application/vnd.wordperfect;" type="string" offset="0"/> </magic> <glob pattern="*.wpd"/> <glob pattern="*.wp"/> @@ -3745,14 +3745,11 @@ <_comment> Quattro Pro - Corel Spreadsheet (part of WordPerfect Office suite) 
</_comment> - <!-- Conflicts with MS Word .doc format: - <magic priority="90"> - <match value="0xD0CF11E0A1B11AE1" type="string" offset="0"/> - </magic> - --> +<!-- + Let's hold off on this for now until we deconflict with x-123 <magic priority="50"> <match value="0x00000200" type="big32" offset="0"/> - </magic> + </magic> --> <glob pattern="*.qpw"/> <glob pattern="*.wb1"/> <glob pattern="*.wb2"/> http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java index 7192120..a9ba360 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java @@ -25,6 +25,9 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.QuattroPro; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -33,7 +36,7 @@ import org.xml.sax.SAXException; * This format appears to be compatible with more recent versions too. 
* @author Pascal Essiembre */ -public class QPWTextExtractor { +class QPWTextExtractor { private static final Logger LOG = LogManager.getLogger(QPWTextExtractor.class); @@ -50,12 +53,12 @@ public class QPWTextExtractor { ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort()); ctx.in.readWPShort(); // Last saved bits ctx.metadata.set(QuattroPro.LOWEST_VERSION, ctx.in.readWPShort()); - ctx.metadata.set(QuattroPro.PAGE_COUNT, ctx.in.readWPShort()); + ctx.metadata.set(Office.PAGE_COUNT, ctx.in.readWPShort()); ctx.in.skipWPByte(ctx.bodyLength - 14); }}, USER { @Override public void extract(Context ctx) throws IOException { - ctx.metadata.set(QuattroPro.CREATOR, getQstrLabel(ctx.in)); - ctx.metadata.set(QuattroPro.LAST_USER, getQstrLabel(ctx.in)); + ctx.metadata.set(TikaCoreProperties.CREATOR, getQstrLabel(ctx.in)); + ctx.metadata.set(TikaCoreProperties.MODIFIER, getQstrLabel(ctx.in)); }}, EXT_LINK { @Override public void extract(Context ctx) throws IOException, SAXException { @@ -127,7 +130,7 @@ public class QPWTextExtractor { // Holds extractors for each record types we are interested in. // All record types not defined here will be skipped. 
private static final Map<Integer, Extractor> EXTRACTORS = - new HashMap<Integer, Extractor>(); + new HashMap<>(); static { //--- Global Records --- EXTRACTORS.put(0x0001, Extractor.BOF); // Beginning of file @@ -190,7 +193,7 @@ public class QPWTextExtractor { extractor.extract(ctx); } else { // Use DEBUG to find out what we are ignoring - //Extractor.DEBUG.extract(ctx); +// Extractor.DEBUG.extract(ctx); Extractor.IGNORE.extract(ctx); } } http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java deleted file mode 100644 index 8270f8d..0000000 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2016 Norconex Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.wordperfect; - -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; - -/** - * QuattroPro properties collection. 
- * @author Pascal Essiembre - */ -public interface QuattroPro { - public static final String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect"; - - public static final String META_CREATOR = "creator"; - public static final String META_LAST_USER = "last-user"; - - /** - * ID. - */ - Property ID = Property.internalText( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id"); - /** - * Version. - */ - Property VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version"); - /** - * Build. - */ - Property BUILD = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build"); - /** - * Lowest version. - */ - Property LOWEST_VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion"); - /** - * Number of pages. - */ - Property PAGE_COUNT = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "PageCount"); - /** - * Creator. - */ - Property CREATOR = Property.internalText( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "Creator"); - /** - * Last User. 
- */ - Property LAST_USER = Property.internalText( - QUATTROPRO_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastUser"); -} http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java index f17837c..194bad7 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java @@ -18,7 +18,7 @@ package org.apache.tika.parser.wordperfect; * WordPerfect constant values used for parsing and extracting text. * @author Pascal Essiembre */ -public final class WP6Constants { +final class WP6Constants { public static final String WP6_FILE_ID = "ÿWPC"; http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java index 4b81256..6a95335 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java @@ -14,13 +14,12 @@ */ package org.apache.tika.parser.wordperfect; -import org.apache.commons.lang.builder.ToStringBuilder; /** * WordPerfect file header. 
* @author Pascal Essiembre */ -public class WP6FileHeader { +class WP6FileHeader { // Normal header private String fileId; @@ -112,17 +111,17 @@ public class WP6FileHeader { @Override public String toString() { - ToStringBuilder builder = new ToStringBuilder(this); - builder.append("fileId", fileId); - builder.append("docAreaPointer", docAreaPointer); - builder.append("productType", productType); - builder.append("fileType", fileType); - builder.append("majorVersion", majorVersion); - builder.append("minorVersion", minorVersion); - builder.append("encrypted", encrypted); - builder.append("indexAreaPointer", indexAreaPointer); - builder.append("fileSize", fileSize); - return builder.toString(); + return "WP6FileHeader{" + + "fileId='" + fileId + '\'' + + ", docAreaPointer=" + docAreaPointer + + ", productType=" + productType + + ", fileType=" + fileType + + ", majorVersion=" + majorVersion + + ", minorVersion=" + minorVersion + + ", encrypted=" + encrypted + + ", indexAreaPointer=" + indexAreaPointer + + ", fileSize=" + fileSize + + '}'; } @Override http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java index 1a2198d..baf999a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.WordPerfect; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -26,7 +27,7 @@ import org.xml.sax.SAXException; * This format appears to be 
compatible with more recent versions too. * @author Pascal Essiembre */ -public class WP6TextExtractor { +class WP6TextExtractor { public void extract( InputStream input, XHTMLContentHandler xhtml, Metadata metadata) @@ -44,7 +45,7 @@ public class WP6TextExtractor { } private void applyMetadata(WP6FileHeader header, Metadata metadata) { - metadata.set(WordPerfect.FILE_SIZE, + metadata.set(WordPerfect.FILE_SIZE, Long.toString(header.getFileSize())); metadata.set(WordPerfect.FILE_ID, header.getFileId()); metadata.set(WordPerfect.PRODUCT_TYPE, header.getProductType()); @@ -92,10 +93,10 @@ public class WP6TextExtractor { out.append('\n'); } else if (c >= 208 && c <= 239) { // Variable-Length Multi-Byte Functions - int subgroup = in.read(); + int subgroup = in.readWP(); int functionSize = in.readWPShort(); for (int i = 0; i < functionSize - 4; i++) { - in.read(); + in.readWP(); } // End-of-Line group @@ -121,9 +122,9 @@ public class WP6TextExtractor { } else if (c == 240) { // extended char - int charval = in.read(); - int charset = in.read(); - in.read(); // closing character + int charval = in.readWP(); + int charset = in.readWP(); + in.readWP(); // closing character //TODO implement all charsets if (charset == 4 || charset == 5) { @@ -184,10 +185,10 @@ public class WP6TextExtractor { in.mark(30); header.setFileId(in.readWPString(4)); // 1-4 header.setDocAreaPointer(in.readWPLong()); // 5-8 - header.setProductType(in.read()); // 9 + header.setProductType(in.readWP()); // 9 header.setFileType(in.readWPChar()); // 10 - header.setMajorVersion(in.read()); // 11 - header.setMinorVersion(in.read()); // 12 + header.setMajorVersion(in.readWP()); // 11 + header.setMinorVersion(in.readWP()); // 12 header.setEncrypted(in.readWPShort() != 0); // 13-14 header.setIndexAreaPointer(in.readWPShort()); // 15-16 try { http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java 
---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java index 2da276b..67c3200 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java @@ -26,7 +26,7 @@ import org.apache.commons.lang.StringUtils; * {@link InputStream} wrapper adding WordPerfect-specific byte-reading methods. * @author Pascal Essiembre */ -public class WPInputStream extends InputStream { +class WPInputStream extends InputStream { private final DataInputStream in; @@ -102,7 +102,11 @@ public class WPInputStream extends InputStream { * @throws IOException if not enough bytes remain */ public char readWPChar() throws IOException { - return (char) in.read(); + int c = in.read(); + if (c == -1) { + throw new EOFException(); + } + return (char)c; } /** @@ -145,25 +149,48 @@ public class WPInputStream extends InputStream { * @throws IOException if not enough bytes remain */ public String readWPHex() throws IOException { - return StringUtils.leftPad(Integer.toString(read(), 16), 2, '0'); + return StringUtils.leftPad(Integer.toString(readWP(), 16), 2, '0'); } - - + + /** + * Reads a byte + * @return byte read + * @throws IOException if not enough bytes remain + */ + public int readWP() throws IOException { + int i = read(); + if (i == -1) { + throw new EOFException(); + } + return i; + } + + @Override public int read() throws IOException { return in.read(); } + + /** + * Does not guarantee full buffer is read. + */ @Override public int read(byte[] b) throws IOException { return in.read(b); } + /** + * Does not guarantee full buffer is read. 
+ */ @Override public int read(byte[] b, int off, int len) throws IOException { return in.read(b, off, len); } + /** + * Does not guarantee full length is skipped. + */ @Override public long skip(long n) throws IOException { return in.skip(n); http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java deleted file mode 100644 index aadbd35..0000000 --- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2016 Norconex Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.wordperfect; - -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; - -/** - * WordPerfect properties collection. - * @author Pascal Essiembre - */ -public interface WordPerfect { - public static final String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect"; - - /** - * File size as defined in document header. - */ - Property FILE_SIZE = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileSize"); - /** - * File identifier. 
- */ - Property FILE_ID = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileId"); - /** - * Product type. - */ - Property PRODUCT_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "ProductType"); - /** - * File type. - */ - Property FILE_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileType"); - /** - * Major version. - */ - Property MAJOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "MajorVersion"); - /** - * Minor version. - */ - Property MINOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorVersion"); - /** - * Is encrypted?. - */ - Property ENCRYPTED = Property.internalBoolean( - WORDPERFECT_METADATA_NAME_PREFIX - + Metadata.NAMESPACE_PREFIX_DELIMITER + "Encrypted"); -} http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java index ea5d12f..79dbd1c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java @@ -15,16 +15,11 @@ package org.apache.tika.parser.wordperfect; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; -import java.io.File; -import java.io.FileInputStream; -import java.io.StringWriter; - -import org.apache.tika.Tika; import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import 
org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; /** @@ -32,27 +27,26 @@ import org.junit.Test; * @author Pascal Essiembre */ public class QuattroProTest extends TikaTest { - - private Tika tika = new Tika(); - //TODO add testWB/testQUATTRO.wb3 if .wb? files get supported @Test public void testQPW() throws Exception { - File file = getResourceAsFile("/test-documents/testQUATTRO.qpw"); - - Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - tika.getParser().parse( - new FileInputStream(file), - new WriteOutContentHandler(writer), - metadata, - new ParseContext()); - String content = writer.toString(); + XMLResult r = getXML("testQUATTRO.qpw"); assertEquals("application/x-quattro-pro", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); - assertContains("This is an example spreadsheet", content); + r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("This is an example spreadsheet", r.xml); + } + + @Test + public void testWB3() throws Exception { + try { + XMLResult r = getXML("testQUATTRO.wb3"); + fail("Should have thrown Tika exception...wb3 is unsupported"); + } catch (TikaException e) { + + } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java new file mode 100644 index 0000000..d204e0c --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.wordperfect; + +import static org.junit.Assert.fail; + +import java.io.ByteArrayInputStream; +import java.io.EOFException; + +import org.junit.Test; + +public class WPInputStreamTest { + //These test that we guarantee that a byte is read/skipped with the readWPX calls + //but not with the regular read(), read(..), etc. 
+ + @Test + public void testReadByte() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPByte(); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + + @Test + public void testReadShort() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPShort(); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + + @Test + public void testReadChar() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPChar(); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + @Test + public void testReadHex() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPHex(); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + @Test + public void testReadHexString() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPHexString(10); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + @Test + public void testReadLong() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPLong(); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + + @Test + public void testReadString() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + wpInputStream.readWPString(10); + fail("should have thrown EOF"); + } catch (EOFException e) { + + } + } + + @Test + public void testReadArr() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + 
byte[] buffer = new byte[10]; + wpInputStream.read(buffer); + } catch (EOFException e) { + fail("should not have thrown EOF"); + } + } + + @Test + public void testReadArrOffset() throws Exception { + WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0])); + try { + byte[] buffer = new byte[10]; + wpInputStream.read(buffer, 0, 2); + } catch (EOFException e) { + fail("should not have thrown EOF"); + } + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java index c3af274..38675aa 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java @@ -16,15 +16,8 @@ package org.apache.tika.parser.wordperfect; import static org.junit.Assert.assertEquals; -import java.io.File; -import java.io.FileInputStream; -import java.io.StringWriter; - -import org.apache.tika.Tika; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; /** @@ -33,24 +26,14 @@ import org.junit.Test; */ public class WordPerfectTest extends TikaTest { - private Tika tika = new Tika(); @Test public void testWordPerfectParser() throws Exception { - File file = getResourceAsFile("/test-documents/testWordPerfect.wpd"); - - Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - tika.getParser().parse( - new FileInputStream(file), - new WriteOutContentHandler(writer), - metadata, - new ParseContext()); - String content = writer.toString(); + XMLResult r = 
getXML("testWordPerfect.wpd"); assertEquals("application/vnd.wordperfect", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); - assertContains("test test", content); + r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("test test", r.xml); } }
