TIKA-1946 -- initial commit of QuattroPro and WordPerfect parsers.  Many thanks 
to Pascal Essiembre for contributing these!!!


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d011d708
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d011d708
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d011d708

Branch: refs/heads/master
Commit: d011d708c21669759af86e855b61d98dae19492e
Parents: 7a5b983
Author: tballison <[email protected]>
Authored: Wed Dec 21 12:10:08 2016 -0500
Committer: tballison <[email protected]>
Committed: Wed Dec 21 12:10:08 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../org/apache/tika/metadata/QuattroPro.java    |  48 +++++++
 .../org/apache/tika/metadata/WordPerfect.java   |  66 +++++++++
 .../org/apache/tika/mime/tika-mimetypes.xml     |  13 +-
 .../parser/wordperfect/QPWTextExtractor.java    |  15 ++-
 .../tika/parser/wordperfect/QuattroPro.java     |  72 ----------
 .../tika/parser/wordperfect/WP6Constants.java   |   2 +-
 .../tika/parser/wordperfect/WP6FileHeader.java  |  25 ++--
 .../parser/wordperfect/WP6TextExtractor.java    |  21 +--
 .../tika/parser/wordperfect/WPInputStream.java  |  37 ++++-
 .../tika/parser/wordperfect/WordPerfect.java    |  69 ----------
 .../tika/parser/wordperfect/QuattroProTest.java |  40 +++---
 .../parser/wordperfect/WPInputStreamTest.java   | 134 +++++++++++++++++++
 .../parser/wordperfect/WordPerfectTest.java     |  25 +---
 14 files changed, 342 insertions(+), 228 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bd6a45e..a9cf6f1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Add parsers for WordPerfect and QuattroPro (.qpw) files.
+    Contributed by Pascal Essiembre (TIKA-1946).
+
   * Add configurability of "preserve-interword-spacing" to
     TesseractOCRParser (TIKA-2190).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
new file mode 100644
index 0000000..a106e08
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
@@ -0,0 +1,48 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * QuattroPro properties collection.
+ * @author Pascal Essiembre
+ */
+public interface QuattroPro {
+   public static final String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect";
+
+   /**
+    * ID.
+    */
+   Property ID = Property.internalText(
+           QUATTROPRO_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id");
+   /**
+    * Version.
+    */
+   Property VERSION = Property.internalInteger(
+           QUATTROPRO_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version");
+   /**
+    * Build.
+    */
+   Property BUILD = Property.internalInteger(
+           QUATTROPRO_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build");
+   /**
+    * Lowest version.
+    */
+   Property LOWEST_VERSION = Property.internalInteger(
+           QUATTROPRO_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion");
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
new file mode 100644
index 0000000..12ca174
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
@@ -0,0 +1,66 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * WordPerfect properties collection.
+ * @author Pascal Essiembre
+ */
+public interface WordPerfect {
+   public static final String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect";
+
+   /**
+    * File size as defined in document header. 
+    */
+   Property FILE_SIZE = Property.internalText(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileSize");
+   /**
+    * File identifier. 
+    */
+   Property FILE_ID = Property.internalText(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileId");
+   /**
+    * Product type. 
+    */
+   Property PRODUCT_TYPE = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "ProductType");
+   /**
+    * File type. 
+    */
+   Property FILE_TYPE = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileType");
+   /**
+    * Major version. 
+    */
+   Property MAJOR_VERSION = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "MajorVersion");
+   /**
+    * Minor version. 
+    */
+   Property MINOR_VERSION = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorVersion");
+   /**
+    * Is encrypted?. 
+    */
+   Property ENCRYPTED = Property.internalBoolean(
+           WORDPERFECT_METADATA_NAME_PREFIX
+                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Encrypted");
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 7afdb4d..460bcde 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2594,10 +2594,10 @@
     <tika:link>http://en.wikipedia.org/wiki/WordPerfect</tika:link>
     <tika:uti>com.corel.wordperfect.doc</tika:uti>
     <magic priority="60">
-      <match value="0xFF575043" type="big32" offset="0"/>
+      <match value="0xFF575043" type="big32" offset="0"/> <!-- ÿWPC -->
     </magic>
     <magic priority="50">
-      <match value="application/vnd.wordperfect;" type="string" offset="0"></match>
+      <match value="application/vnd.wordperfect;" type="string" offset="0"/>
     </magic>
     <glob pattern="*.wpd"/>
     <glob pattern="*.wp"/>
@@ -3745,14 +3745,11 @@
     <_comment>
       Quattro Pro - Corel Spreadsheet (part of WordPerfect Office suite)
     </_comment>
-    <!-- Conflicts with MS Word .doc format:
-    <magic priority="90">
-      <match value="0xD0CF11E0A1B11AE1" type="string" offset="0"/>
-    </magic>
-     -->
+<!--
+    Let's hold off on this for now until we deconflict with x-123
     <magic priority="50">
       <match value="0x00000200" type="big32" offset="0"/>
-    </magic>
+    </magic> -->
     <glob pattern="*.qpw"/>
     <glob pattern="*.wb1"/> 
     <glob pattern="*.wb2"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
index 7192120..a9ba360 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
@@ -25,6 +25,9 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.QuattroPro;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -33,7 +36,7 @@ import org.xml.sax.SAXException;
  * This format appears to be compatible with more recent versions too.
  * @author Pascal Essiembre
  */
-public class QPWTextExtractor {
+class QPWTextExtractor {
 
     private static final Logger LOG = 
             LogManager.getLogger(QPWTextExtractor.class);
@@ -50,12 +53,12 @@ public class QPWTextExtractor {
             ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
             ctx.in.readWPShort(); // Last saved bits
             ctx.metadata.set(QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
-            ctx.metadata.set(QuattroPro.PAGE_COUNT, ctx.in.readWPShort());
+            ctx.metadata.set(Office.PAGE_COUNT, ctx.in.readWPShort());
             ctx.in.skipWPByte(ctx.bodyLength - 14);
         }},
         USER { @Override public void extract(Context ctx) throws IOException {
-            ctx.metadata.set(QuattroPro.CREATOR, getQstrLabel(ctx.in));
-            ctx.metadata.set(QuattroPro.LAST_USER, getQstrLabel(ctx.in));
+            ctx.metadata.set(TikaCoreProperties.CREATOR, getQstrLabel(ctx.in));
+            ctx.metadata.set(TikaCoreProperties.MODIFIER, getQstrLabel(ctx.in));
         }},
         EXT_LINK { @Override public void extract(Context ctx) 
                 throws IOException, SAXException {
@@ -127,7 +130,7 @@ public class QPWTextExtractor {
     // Holds extractors for each record types we are interested in.
     // All record types not defined here will be skipped.
     private static final Map<Integer, Extractor> EXTRACTORS = 
-            new HashMap<Integer, Extractor>();
+            new HashMap<>();
     static {
         //--- Global Records ---
         EXTRACTORS.put(0x0001, Extractor.BOF);     // Beginning of file
@@ -190,7 +193,7 @@ public class QPWTextExtractor {
                     extractor.extract(ctx);
                 } else {
                     // Use DEBUG to find out what we are ignoring
-                    //Extractor.DEBUG.extract(ctx);
+//                    Extractor.DEBUG.extract(ctx);
                     Extractor.IGNORE.extract(ctx);
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
deleted file mode 100644
index 8270f8d..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2016 Norconex Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.wordperfect;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-
-/**
- * QuattroPro properties collection.
- * @author Pascal Essiembre
- */
-public interface QuattroPro {
-   public static final String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect";
-
-   public static final String META_CREATOR = "creator";
-   public static final String META_LAST_USER = "last-user";
-   
-   /**
-    * ID. 
-    */
-   Property ID = Property.internalText(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id");
-   /**
-    * Version. 
-    */
-   Property VERSION = Property.internalInteger(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version");
-   /**
-    * Build. 
-    */
-   Property BUILD = Property.internalInteger(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build");
-   /**
-    * Lowest version. 
-    */
-   Property LOWEST_VERSION = Property.internalInteger(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion");
-   /**
-    * Number of pages. 
-    */
-   Property PAGE_COUNT = Property.internalInteger(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "PageCount");
-   /**
-    * Creator. 
-    */
-   Property CREATOR = Property.internalText(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Creator");
-   /**
-    * Last User. 
-    */
-   Property LAST_USER = Property.internalText(
-           QUATTROPRO_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastUser");
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
index f17837c..194bad7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
@@ -18,7 +18,7 @@ package org.apache.tika.parser.wordperfect;
  * WordPerfect constant values used for parsing and extracting text.
  * @author Pascal Essiembre
  */
-public final class WP6Constants {
+final class WP6Constants {
 
 
     public static final String WP6_FILE_ID = "ÿWPC";

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
index 4b81256..6a95335 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
@@ -14,13 +14,12 @@
  */
 package org.apache.tika.parser.wordperfect;
 
-import org.apache.commons.lang.builder.ToStringBuilder;
 
 /**
  * WordPerfect file header.
  * @author Pascal Essiembre
  */
-public class WP6FileHeader {
+class WP6FileHeader {
 
     // Normal header
     private String fileId;
@@ -112,17 +111,17 @@ public class WP6FileHeader {
 
     @Override
     public String toString() {
-        ToStringBuilder builder = new ToStringBuilder(this);
-        builder.append("fileId", fileId);
-        builder.append("docAreaPointer", docAreaPointer);
-        builder.append("productType", productType);
-        builder.append("fileType", fileType);
-        builder.append("majorVersion", majorVersion);
-        builder.append("minorVersion", minorVersion);
-        builder.append("encrypted", encrypted);
-        builder.append("indexAreaPointer", indexAreaPointer);
-        builder.append("fileSize", fileSize);
-        return builder.toString();
+        return "WP6FileHeader{" +
+                "fileId='" + fileId + '\'' +
+                ", docAreaPointer=" + docAreaPointer +
+                ", productType=" + productType +
+                ", fileType=" + fileType +
+                ", majorVersion=" + majorVersion +
+                ", minorVersion=" + minorVersion +
+                ", encrypted=" + encrypted +
+                ", indexAreaPointer=" + indexAreaPointer +
+                ", fileSize=" + fileSize +
+                '}';
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
index 1a2198d..baf999a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
@@ -18,6 +18,7 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.WordPerfect;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -26,7 +27,7 @@ import org.xml.sax.SAXException;
  * This format appears to be compatible with more recent versions too.
  * @author Pascal Essiembre
  */
-public class WP6TextExtractor {
+class WP6TextExtractor {
 
     public void extract(
             InputStream input, XHTMLContentHandler xhtml, Metadata metadata) 
@@ -44,7 +45,7 @@ public class WP6TextExtractor {
     }
 
     private void applyMetadata(WP6FileHeader header, Metadata metadata) {
-        metadata.set(WordPerfect.FILE_SIZE, 
+        metadata.set(WordPerfect.FILE_SIZE,
                 Long.toString(header.getFileSize()));
         metadata.set(WordPerfect.FILE_ID, header.getFileId());
         metadata.set(WordPerfect.PRODUCT_TYPE, header.getProductType());
@@ -92,10 +93,10 @@ public class WP6TextExtractor {
                 out.append('\n');
             } else if (c >= 208 && c <= 239) {
                 // Variable-Length Multi-Byte Functions
-                int subgroup = in.read();
+                int subgroup = in.readWP();
                 int functionSize = in.readWPShort();
                 for (int i = 0; i < functionSize - 4; i++) {
-                    in.read(); 
+                    in.readWP();
                 }
                 
                 // End-of-Line group
@@ -121,9 +122,9 @@ public class WP6TextExtractor {
                 
             } else if (c == 240) {
                 // extended char
-                int charval = in.read();
-                int charset = in.read();
-                in.read(); // closing character
+                int charval = in.readWP();
+                int charset = in.readWP();
+                in.readWP(); // closing character
   
                 //TODO implement all charsets
                 if (charset == 4 || charset == 5) {
@@ -184,10 +185,10 @@ public class WP6TextExtractor {
         in.mark(30);
         header.setFileId(in.readWPString(4));         // 1-4
         header.setDocAreaPointer(in.readWPLong());    // 5-8
-        header.setProductType(in.read());             // 9
+        header.setProductType(in.readWP());             // 9
         header.setFileType(in.readWPChar());          // 10
-        header.setMajorVersion(in.read());            // 11
-        header.setMinorVersion(in.read());            // 12
+        header.setMajorVersion(in.readWP());            // 11
+        header.setMinorVersion(in.readWP());            // 12
         header.setEncrypted(in.readWPShort() != 0);   // 13-14
         header.setIndexAreaPointer(in.readWPShort()); // 15-16
         try {

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
index 2da276b..67c3200 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
@@ -26,7 +26,7 @@ import org.apache.commons.lang.StringUtils;
  * {@link InputStream} wrapper adding WordPerfect-specific byte-reading methods.
  * @author Pascal Essiembre
  */
-public class WPInputStream extends InputStream {
+class WPInputStream extends InputStream {
 
     private final DataInputStream in;
     
@@ -102,7 +102,11 @@ public class WPInputStream extends InputStream {
      * @throws IOException if not enough bytes remain
      */
     public char readWPChar() throws IOException {
-        return (char) in.read();
+        int c = in.read();
+        if (c == -1) {
+            throw new EOFException();
+        }
+        return (char)c;
     }
 
     /**
@@ -145,25 +149,48 @@ public class WPInputStream extends InputStream {
      * @throws IOException if not enough bytes remain
      */
     public String readWPHex() throws IOException {
-        return StringUtils.leftPad(Integer.toString(read(), 16), 2, '0');
+        return StringUtils.leftPad(Integer.toString(readWP(), 16), 2, '0');
     }
-    
-    
+
+    /**
+     * Reads a byte
+     * @return byte read
+     * @throws IOException if not enough bytes remain
+     */
+    public int readWP() throws IOException {
+        int i = read();
+        if (i == -1) {
+            throw new EOFException();
+        }
+        return i;
+    }
+
+
     @Override
     public int read() throws IOException {
         return in.read();
     }
 
+
+    /**
+     * Does not guarantee full buffer is read.
+     */
     @Override
     public int read(byte[] b) throws IOException {
         return in.read(b);
     }
 
+    /**
+     * Does not guarantee full buffer is read.
+     */
     @Override
     public int read(byte[] b, int off, int len) throws IOException {
         return in.read(b, off, len);
     }
 
+    /**
+     * Does not guarantee full length is skipped.
+     */
     @Override
     public long skip(long n) throws IOException {
         return in.skip(n);

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
deleted file mode 100644
index aadbd35..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2016 Norconex Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.wordperfect;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-
-/**
- * WordPerfect properties collection.
- * @author Pascal Essiembre
- */
-public interface WordPerfect {
-   public static final String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect";
-
-   /**
-    * File size as defined in document header. 
-    */
-   Property FILE_SIZE = Property.internalText(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileSize");
-   /**
-    * File identifier. 
-    */
-   Property FILE_ID = Property.internalText(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileId");
-   /**
-    * Product type. 
-    */
-   Property PRODUCT_TYPE = Property.internalInteger(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "ProductType");
-   /**
-    * File type. 
-    */
-   Property FILE_TYPE = Property.internalInteger(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileType");
-   /**
-    * Major version. 
-    */
-   Property MAJOR_VERSION = Property.internalInteger(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "MajorVersion");
-   /**
-    * Minor version. 
-    */
-   Property MINOR_VERSION = Property.internalInteger(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorVersion");
-   /**
-    * Is encrypted?. 
-    */
-   Property ENCRYPTED = Property.internalBoolean(
-           WORDPERFECT_METADATA_NAME_PREFIX
-                   + Metadata.NAMESPACE_PREFIX_DELIMITER + "Encrypted");
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
index ea5d12f..79dbd1c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
@@ -15,16 +15,11 @@
 package org.apache.tika.parser.wordperfect;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.StringWriter;
-
-import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.junit.Test;
 
 /**
@@ -32,27 +27,26 @@ import org.junit.Test;
  * @author Pascal Essiembre
  */
 public class QuattroProTest extends TikaTest {
-
-    private Tika tika = new Tika();
-
     //TODO add testWB/testQUATTRO.wb3 if .wb? files get supported
     
     @Test
     public void testQPW() throws Exception {
-        File file = getResourceAsFile("/test-documents/testQUATTRO.qpw");
-
-        Metadata metadata = new Metadata();
-        StringWriter writer = new StringWriter();
-        tika.getParser().parse(
-                new FileInputStream(file),
-                new WriteOutContentHandler(writer),
-                metadata,
-                new ParseContext());
-        String content = writer.toString();
 
+        XMLResult r = getXML("testQUATTRO.qpw");
         assertEquals("application/x-quattro-pro", 
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
-        assertContains("This is an example spreadsheet", content);
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertContains("This is an example spreadsheet", r.xml);
+    }
+
+    @Test
+    public void testWB3() throws Exception {
+        try {
+            XMLResult r = getXML("testQUATTRO.wb3");
+            fail("Should have thrown Tika exception...wb3 is unsupported");
+        } catch (TikaException e) {
+
+        }
+
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java
new file mode 100644
index 0000000..d204e0c
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WPInputStreamTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.wordperfect;
+
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+
+import org.junit.Test;
+
+public class WPInputStreamTest {
+    //These test that we guarantee that a byte is read/skipped with the readWPX calls
+    //but not with the regular read(), read(..), etc.
+
+    @Test
+    public void testReadByte() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPByte();
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+
+    @Test
+    public void testReadShort() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPShort();
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+
+    @Test
+    public void testReadChar() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPChar();
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+    @Test
+    public void testReadHex() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPHex();
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+    @Test
+    public void testReadHexString() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPHexString(10);
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+    @Test
+    public void testReadLong() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPLong();
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+
+    @Test
+    public void testReadString() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            wpInputStream.readWPString(10);
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+
+        }
+    }
+
+    @Test
+    public void testReadArr() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            byte[] buffer = new byte[10];
+            wpInputStream.read(buffer);
+        } catch (EOFException e) {
+            fail("should not have thrown EOF");
+        }
+    }
+
+    @Test
+    public void testReadArrOffset() throws Exception {
+        WPInputStream wpInputStream = new WPInputStream(new ByteArrayInputStream(new byte[0]));
+        try {
+            byte[] buffer = new byte[10];
+            wpInputStream.read(buffer, 0, 2);
+        } catch (EOFException e) {
+            fail("should not have thrown EOF");
+        }
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d011d708/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
index c3af274..38675aa 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
@@ -16,15 +16,8 @@ package org.apache.tika.parser.wordperfect;
 
 import static org.junit.Assert.assertEquals;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.StringWriter;
-
-import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.junit.Test;
 
 /**
@@ -33,24 +26,14 @@ import org.junit.Test;
  */
 public class WordPerfectTest extends TikaTest {
 
-    private Tika tika = new Tika();
 
     @Test
     public void testWordPerfectParser() throws Exception {
-        File file = getResourceAsFile("/test-documents/testWordPerfect.wpd");
-
-        Metadata metadata = new Metadata();
-        StringWriter writer = new StringWriter();
-        tika.getParser().parse(
-                new FileInputStream(file),
-                new WriteOutContentHandler(writer),
-                metadata,
-                new ParseContext());
-        String content = writer.toString();
 
+        XMLResult r = getXML("testWordPerfect.wpd");
         assertEquals("application/vnd.wordperfect", 
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
-        assertContains("test test", content);
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertContains("test test", r.xml);
     }
 }

Reply via email to