Repository: tika
Updated Branches:
  refs/heads/master 202f137b7 -> d011d708c


New WordPerfect and QuattroPro parsers for TIKA-1946 contributed by
pascal.essiembre

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/87c2ef31
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/87c2ef31
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/87c2ef31

Branch: refs/heads/master
Commit: 87c2ef3191d0a86502dc249240022b3cc973aaa4
Parents: 2dbd651
Author: Pascal Essiembre <[email protected]>
Authored: Tue Dec 20 15:42:39 2016 -0500
Committer: Pascal Essiembre <[email protected]>
Committed: Tue Dec 20 15:42:39 2016 -0500

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml     |  35 +-
 .../parser/wordperfect/QPWTextExtractor.java    | 223 ++++++++++
 .../tika/parser/wordperfect/QuattroPro.java     |  72 ++++
 .../parser/wordperfect/QuattroProParser.java    |  71 +++
 .../tika/parser/wordperfect/WP6Constants.java   | 432 +++++++++++++++++++
 .../tika/parser/wordperfect/WP6FileHeader.java  | 192 +++++++++
 .../parser/wordperfect/WP6TextExtractor.java    | 218 ++++++++++
 .../tika/parser/wordperfect/WPInputStream.java  | 196 +++++++++
 .../tika/parser/wordperfect/WordPerfect.java    |  69 +++
 .../parser/wordperfect/WordPerfectParser.java   |  74 ++++
 .../services/org.apache.tika.parser.Parser      |   2 +
 .../tika/parser/wordperfect/QuattroProTest.java |  58 +++
 .../parser/wordperfect/WordPerfectTest.java     |  56 +++
 .../test-documents/testWordPerfect.wpd          | Bin 0 -> 2044 bytes
 14 files changed, 1694 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 30068da..7afdb4d 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2585,11 +2585,28 @@
   <mime-type type="application/vnd.wmf.bootstrap"/>
   <mime-type type="application/vnd.wordperfect">
     <alias type="application/wordperfect"/>
+    <alias type="application/wordperfect5.1"/>
+    <alias type="application/wordperfect6.0"/>
+    <alias type="application/wordperfect6.1"/>
+    <alias type="application/x-corel-wordperfect"/>
+    <acronym>WPD</acronym>
+    <_comment>WordPerfect - Corel Word Processing</_comment>
+    <tika:link>http://en.wikipedia.org/wiki/WordPerfect</tika:link>
+    <tika:uti>com.corel.wordperfect.doc</tika:uti>
+    <magic priority="60">
+      <match value="0xFF575043" type="big32" offset="0"/>
+    </magic>
     <magic priority="50">
-      <match value="0xFF575043" type="string" offset="0:3"/> <!-- ÿWPC -->
+      <match value="application/vnd.wordperfect;" type="string" 
offset="0"></match>
     </magic>
     <glob pattern="*.wpd"/>
-  </mime-type>
+    <glob pattern="*.wp"/>
+    <glob pattern="*.wp5"/>
+    <glob pattern="*.wp6"/>
+    <glob pattern="*.w60"/>
+    <glob pattern="*.wp61"/>
+    <glob pattern="*.wpt"/>
+  </mime-type>  
   <mime-type type="application/vnd.wqd">
     <glob pattern="*.wqd"/>
   </mime-type>
@@ -3725,11 +3742,21 @@
   </mime-type>
 
   <mime-type type="application/x-quattro-pro">
+    <_comment>
+      Quattro Pro - Corel Spreadsheet (part of WordPerfect Office suite)
+    </_comment>
+    <!-- Conflicts with MS Word .doc format:
+    <magic priority="90">
+      <match value="0xD0CF11E0A1B11AE1" type="string" offset="0"/>
+    </magic>
+     -->
+    <magic priority="50">
+      <match value="0x00000200" type="big32" offset="0"/>
+    </magic>
     <glob pattern="*.qpw"/>
-    <glob pattern="*.wb1"/>
+    <glob pattern="*.wb1"/> 
     <glob pattern="*.wb2"/>
     <glob pattern="*.wb3"/>
-    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
   <mime-type type="application/xquery">

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
new file mode 100644
index 0000000..2242001
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
@@ -0,0 +1,223 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts text from a Quattro Pro document according to QPW v9 File Format.
+ * This format appears to be compatible with more recent versions too.
+ * <p>
+ * The input is opened as an OLE2 file system and the records stored in its
+ * "NativeContent_MAIN" entry are read sequentially. Each record starts with
+ * a type and a body length, followed by the body itself. Record types with
+ * a registered extractor contribute text or metadata; all other record
+ * types are skipped.
+ * </p>
+ * @author Pascal Essiembre
+ */
+public class QPWTextExtractor {
+
+    private static final Logger LOG =
+            LogManager.getLogger(QPWTextExtractor.class);
+
+    // Name of the OLE entry that holds the record stream.
+    private static final String OLE_DOCUMENT_NAME = "NativeContent_MAIN";
+
+    /**
+     * Extraction strategies, one per kind of record body. Each strategy
+     * consumes the body of the current record from the context stream.
+     */
+    private enum Extractor {
+        // Skips the whole record body.
+        IGNORE {
+            @Override
+            public void extract(Context ctx) throws IOException {
+                ctx.in.skipWPByte(ctx.bodyLength);
+            }
+        },
+        // Beginning of file: stores id, versions and page count as metadata.
+        BOF {
+            @Override
+            public void extract(Context ctx) throws IOException {
+                ctx.metadata.set(QuattroPro.ID, ctx.in.readWPString(4));
+                ctx.metadata.set(QuattroPro.VERSION, ctx.in.readWPShort());
+                ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
+                ctx.in.readWPShort(); // Last saved bits
+                ctx.metadata.set(
+                        QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
+                ctx.metadata.set(QuattroPro.PAGE_COUNT, ctx.in.readWPShort());
+                // Skip the rest of the body; 14 is presumably the number
+                // of bytes consumed above (4-char id + five shorts).
+                ctx.in.skipWPByte(ctx.bodyLength - 14);
+            }
+        },
+        // User record: stores creator and last user as metadata.
+        USER {
+            @Override
+            public void extract(Context ctx) throws IOException {
+                ctx.metadata.set(QuattroPro.CREATOR, getQstrLabel(ctx.in));
+                ctx.metadata.set(QuattroPro.LAST_USER, getQstrLabel(ctx.in));
+            }
+        },
+        // External link: emits the label followed by a line separator.
+        EXT_LINK {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // index
+                ctx.in.readWPShort(); // page first
+                ctx.in.readWPShort(); // page last
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+                ctx.xhtml.characters(System.lineSeparator());
+            }
+        },
+        // String table: emits every entry on its own line.
+        STRING_TABLE {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                long entries = ctx.in.readWPLong();
+                ctx.in.readWPLong();  // Total used
+                ctx.in.readWPLong();  // Total saved
+                // The counter is a long like the entry count: an int
+                // counter would wrap around and never reach a count above
+                // Integer.MAX_VALUE.
+                for (long i = 0; i < entries; i++) {
+                    ctx.xhtml.characters(getQstrLabel(ctx.in));
+                    ctx.xhtml.characters(System.lineSeparator());
+                }
+            }
+        },
+        // Beginning of sheet: emits the label that follows the sheet header.
+        BOS {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // sheet #
+                ctx.in.readWPShort(); // first col index
+                ctx.in.readWPShort(); // last col index
+                ctx.in.readWPLong();  // first row index
+                ctx.in.readWPLong();  // last row index
+                ctx.in.readWPShort(); // format
+                ctx.in.readWPShort(); // flags
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+                ctx.xhtml.characters(System.lineSeparator());
+            }
+        },
+        // Sheet header or footer: emits its label on its own line.
+        SHEET_HEADFOOT {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // flag
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+                ctx.xhtml.characters(System.lineSeparator());
+            }
+        },
+        // Formula string value: emits the label, no line separator.
+        FORMULA_STRING_VALUE {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // column
+                ctx.in.readWPLong();  // row
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+            }
+        },
+        // Generic cell label: emits the label, no line separator.
+        CGENERICLABEL {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // column
+                ctx.in.readWPLong();  // row
+                ctx.in.readWPShort(); // format index
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+            }
+        },
+        // Cell comment: emits the author name then the comment text.
+        CCOMMENT {
+            @Override
+            public void extract(Context ctx)
+                    throws IOException, SAXException {
+                ctx.in.readWPShort(); // column
+                ctx.in.readWPLong();  // row
+                ctx.in.readWPLong();  // flag
+                ctx.xhtml.characters(getQstrLabel(ctx.in));  // author name
+                ctx.xhtml.characters(getQstrLabel(ctx.in));  // comment
+            }
+        },
+        // Use to print out a chunk
+        DEBUG {
+            @Override
+            public void extract(Context ctx) throws IOException {
+                LOG.error("REC ("
+                        + Integer.toHexString(ctx.type) + "/" + ctx.bodyLength
+                        + "):" + ctx.in.readWPString(ctx.bodyLength));
+            }
+        };
+
+        /**
+         * Consumes the body of the current record.
+         * @param ctx stream, output handler, metadata and record header
+         * @throws IOException if the record cannot be read
+         * @throws SAXException if text cannot be written to the handler
+         */
+        public abstract void extract(Context ctx)
+                throws IOException, SAXException;
+    }
+
+    // Holds extractors for each record types we are interested in.
+    // All record types not defined here will be skipped.
+    private static final Map<Integer, Extractor> EXTRACTORS =
+            new HashMap<Integer, Extractor>();
+    static {
+        //--- Global Records ---
+        EXTRACTORS.put(0x0001, Extractor.BOF);     // Beginning of file
+        EXTRACTORS.put(0x0005, Extractor.USER);    // User
+
+        //--- Notebook Records ---
+        EXTRACTORS.put(0x0403, Extractor.EXT_LINK);// External link
+        EXTRACTORS.put(0x0407, Extractor.STRING_TABLE); // String table
+
+        //--- Sheet Records ---
+        EXTRACTORS.put(0x0601, Extractor.BOS); // Beginning of sheet
+        EXTRACTORS.put(0x0605, Extractor.SHEET_HEADFOOT); // Sheet header
+        EXTRACTORS.put(0x0606, Extractor.SHEET_HEADFOOT); // Sheet footer
+
+        //--- Cells ---
+        EXTRACTORS.put(0x0c02, Extractor.FORMULA_STRING_VALUE);
+        EXTRACTORS.put(0x0c72, Extractor.CGENERICLABEL);
+        EXTRACTORS.put(0x0c80, Extractor.CCOMMENT);
+    }
+
+    /**
+     * State shared with the extractors while records are processed: the
+     * record stream, the output handler, the metadata, and the type and
+     * body length of the record currently being read. It is a static
+     * nested class because it needs no reference to the enclosing
+     * extractor instance.
+     */
+    static class Context {
+        private final WPInputStream in;
+        private final XHTMLContentHandler xhtml;
+        private final Metadata metadata;
+        private int type;
+        private int bodyLength;
+        public Context(WPInputStream in, XHTMLContentHandler xhtml,
+                Metadata metadata) {
+            this.in = in;
+            this.xhtml = xhtml;
+            this.metadata = metadata;
+        }
+    }
+
+    /**
+     * Reads all records of the document and writes the extracted text
+     * inside a single "p" element, storing file properties in the metadata.
+     * If the OLE entry holding the records is missing, an informational
+     * message is logged and nothing is written.
+     * @param input the document stream, read as an OLE2 file system
+     * @param xhtml handler receiving the extracted text
+     * @param metadata metadata receiving the file properties
+     * @throws IOException if the document cannot be read
+     * @throws SAXException if text cannot be written to the handler
+     */
+    @SuppressWarnings("resource")
+    public void extract(
+            InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
+                    throws IOException, SAXException {
+
+        POIFSFileSystem pfs = new POIFSFileSystem(input);
+        DirectoryNode rootNode = pfs.getRoot();
+        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
+            // rootNode may be null in this branch: never dereference it
+            // unguarded when building the message.
+            Object found =
+                    rootNode == null ? null : rootNode.getEntryNames();
+            LOG.info("Unsupported QuattroPro file format. "
+                    + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME
+                    + "\". Found: " + found);
+            return;
+        }
+
+        //TODO shall we validate and throw warning/error if the file does not
+        //start with a BOF and ends with a EOF?
+        xhtml.startElement("p");
+        try (WPInputStream in = new WPInputStream(
+                pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
+            Context ctx = new Context(in, xhtml, metadata);
+            while (hasNext(in)) {
+                // Record header: type, then body length.
+                ctx.type = in.readWPShort();
+                ctx.bodyLength = in.readWPShort();
+                Extractor extractor = EXTRACTORS.get(ctx.type);
+                if (extractor != null) {
+                    extractor.extract(ctx);
+                } else {
+                    // Use DEBUG to find out what we are ignoring
+                    //Extractor.DEBUG.extract(ctx);
+                    Extractor.IGNORE.extract(ctx);
+                }
+            }
+        }
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Tells whether at least one more byte can be read, without consuming
+     * it: the stream is marked, one byte is read, and the stream is reset.
+     * NOTE(review): relies on the stream supporting mark/reset -- confirm
+     * against WPInputStream.
+     */
+    private boolean hasNext(InputStream in) throws IOException {
+        try {
+            in.mark(1);
+            return in.read() != -1;
+        } finally {
+            in.reset();
+        }
+    }
+
+    /**
+     * Reads a string made of a short count, a one-byte string type and
+     * then count + 1 characters.
+     * NOTE(review): the extra leading character is presumably part of the
+     * QSTRLABEL layout -- confirm against the file format specification.
+     */
+    private static String getQstrLabel(WPInputStream in) throws IOException {
+        // QSTR
+        int count = in.readWPShort();
+        in.readWPByte(); // string type
+        char[] text = new char[count + 1];
+        text[0] = in.readWPChar();
+
+        // QSTRLABEL
+        for (int i = 0; i < count; i++) {
+            text[i + 1] = in.readWPChar();
+        }
+        return new String(text);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
new file mode 100644
index 0000000..8270f8d
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
@@ -0,0 +1,72 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata property definitions used for Quattro Pro documents.
+ * Every property name is built from the shared prefix, the Tika namespace
+ * delimiter and a property-specific suffix.
+ * NOTE(review): the prefix value is "wordperfect" although these properties
+ * describe Quattro Pro files -- confirm this is intentional.
+ * @author Pascal Essiembre
+ */
+public interface QuattroPro {
+    // Interface fields are implicitly public, static and final.
+    String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect";
+
+    String META_CREATOR = "creator";
+    String META_LAST_USER = "last-user";
+
+    /** File id (internal text property). */
+    Property ID = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id");
+
+    /** File format version (internal integer property). */
+    Property VERSION = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version");
+
+    /** Build number (internal integer property). */
+    Property BUILD = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build");
+
+    /** Lowest version (internal integer property). */
+    Property LOWEST_VERSION = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion");
+
+    /** Number of pages (internal integer property). */
+    Property PAGE_COUNT = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "PageCount");
+
+    /** Creator of the document (internal text property). */
+    Property CREATOR = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Creator");
+
+    /** Last user of the document (internal text property). */
+    Property LAST_USER = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastUser");
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
new file mode 100644
index 0000000..735486f
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
@@ -0,0 +1,71 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * <p>Tika parser for Corel QuattroPro documents (QuattroPro is part of the
+ * Corel WordPerfect Office Suite). It targets the QPW v9 File Format
+ * but appears to be compatible with more recent versions too.</p>
+ * <p>The actual record reading is delegated to
+ * {@link QPWTextExtractor}.</p>
+ * @author Pascal Essiembre
+ */
+public class QuattroProParser extends AbstractParser {
+
+    private static final long serialVersionUID = 8941810225917012232L;
+
+    // Read-only set holding the single media type this parser handles.
+    private static final Set<MediaType> SUPPORTED_TYPES;
+    static {
+        Set<MediaType> types = new HashSet<MediaType>(
+                Arrays.asList(MediaType.application("x-quattro-pro")));
+        SUPPORTED_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    /**
+     * Returns the media types handled by this parser.
+     * @param context parse context (not used)
+     * @return an unmodifiable set containing application/x-quattro-pro
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a QuattroPro document, writing its text to the handler as
+     * XHTML and its properties to the metadata.
+     * @param stream the document stream
+     * @param handler receives the XHTML SAX events
+     * @param metadata receives the document properties
+     * @param context parse context (not used)
+     * @throws IOException if the document cannot be read
+     * @throws SAXException if the SAX events cannot be processed
+     * @throws TikaException declared by the parser contract
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Only fill in the content type when none has been set yet.
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (contentType == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-quattro-pro");
+        }
+
+        XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        new QPWTextExtractor().extract(stream, xhtml, metadata);
+        xhtml.endDocument();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
new file mode 100644
index 0000000..f17837c
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
@@ -0,0 +1,432 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+/**
+ * WordPerfect constant values used for parsing and extracting text.
+ * @author Pascal Essiembre
+ */
+public final class WP6Constants {
+
+
+    public static final String WP6_FILE_ID = "ÿWPC";
+    public static final int WP6_PRODUCT_TYPE = 1;
+    public static final int WP6_FILE_TYPE_WP_DOCUMENT = 10;
+    public static final int WP6_FILE_TYPE_WPD = 36;
+    
+    public static final char[] DEFAULT_EXTENDED_INTL_CHARS = new char[] {
+        '\0',     '\u00E5', '\u00C5', '\u00E6', '\u00C6',
+        '\u00E4', '\u00C4', '\u00E1', '\u00E0', '\u00E2',
+        '\u00E3', '\u00C3', '\u00E7', '\u00C7', '\u00EB', 
+        '\u00E9', '\u00C9', '\u00E8', '\u00EA', '\u00ED', 
+        '\u00F1', '\u00D1', '\u00F8', '\u00D8', '\u00F5', 
+        '\u00D5', '\u00F6', '\u00D6', '\u00FC', '\u00DC', 
+        '\u00FA', '\u00F9', '\u00DF',
+    };
+    
+    /**
+     * Extended character sets used when fixed-length multi-byte functions
+     * with a byte value of 240 (0xF0) are found in a WordPerfect document.
+     * Those character set codes may be specific to WordPerfect 
+     * file specifications and may or may not be considered standard 
+     * outside WordPerfect.
+     */
+    public static final char[][] EXTENDED_CHARSETS = new char[][] {
+        // WP Charset 0: ASCII (95 chars)
+        {
+        ' ', '"', '#', '$', '%', '&', '\'', '(', ')', '*',
+        '+', ',', '-', '.', '/', '0', '1', '2', '3', '4',
+        '5', '6', '7', '8', '9', ':', ';', '<', '=', '>',
+        '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+        'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\',
+        ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f',
+        'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
+        'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+        '{', '|', '}', '~', '\u00A0'
+        },
+        // WP Charset 1: Multinational (242 chars)
+        {
+        
'\u0300','\u00b7','\u0303','\u0302','\u0335','\u0338','\u0301','\u0308',
+        
'\u0304','\u0313','\u0315','\u02bc','\u0326','\u0315','\u00b0','\u0307',
+        
'\u030b','\u0327','\u0328','\u030c','\u0337','\u0305','\u0306','\u00df',
+        
'\u0138','\u006a','\u00c1','\u00e1','\u00c2','\u00e2','\u00c4','\u00e4',
+        
'\u00c0','\u00e0','\u00c5','\u00e5','\u00c6','\u00e6','\u00c7','\u00e7',
+        
'\u00c9','\u00e9','\u00ca','\u00ea','\u00cb','\u00eb','\u00c8','\u00e8',
+        
'\u00cd','\u00ed','\u00ce','\u00ee','\u00cf','\u00ef','\u00cc','\u00ec',
+        
'\u00d1','\u00f1','\u00d3','\u00f3','\u00d4','\u00f4','\u00d6','\u00f6',
+        
'\u00d2','\u00f2','\u00da','\u00fa','\u00db','\u00fb','\u00dc','\u00fc',
+        
'\u00d9','\u00f9','\u0178','\u00ff','\u00c3','\u00e3','\u0110','\u0111',
+        
'\u00d8','\u00f8','\u00d5','\u00f5','\u00dd','\u00fd','\u00d0','\u00f0',
+        
'\u00de','\u00fe','\u0102','\u0103','\u0100','\u0101','\u0104','\u0105',
+        
'\u0106','\u0107','\u010c','\u010d','\u0108','\u0109','\u010a','\u010b',
+        
'\u010e','\u010f','\u011a','\u011b','\u0116','\u0117','\u0112','\u0113',
+        
'\u0118','\u0119','\u01f4','\u01f5','\u011e','\u011f','\u01e6','\u01e7',
+        
'\u0122','\u0123','\u011c','\u011d','\u0120','\u0121','\u0124','\u0125',
+        
'\u0126','\u0127','\u0130','\u0069','\u012a','\u012b','\u012e','\u012f',
+        
'\u0128','\u0129','\u0132','\u0133','\u0134','\u0135','\u0136','\u0137',
+        
'\u0139','\u013a','\u013d','\u013e','\u013b','\u013c','\u013f','\u0140',
+        
'\u0141','\u0142','\u0143','\u0144','\u0000','\u0149','\u0147','\u0148',
+        
'\u0145','\u0146','\u0150','\u0151','\u014c','\u014d','\u0152','\u0153',
+        
'\u0154','\u0155','\u0158','\u0159','\u0156','\u0157','\u015a','\u015b',
+        
'\u0160','\u0161','\u015e','\u015f','\u015c','\u015d','\u0164','\u0165',
+        
'\u0162','\u0163','\u0166','\u0167','\u016c','\u016d','\u0170','\u0171',
+        
'\u016a','\u016b','\u0172','\u0173','\u016e','\u016f','\u0168','\u0169',
+        
'\u0174','\u0175','\u0176','\u0177','\u0179','\u017a','\u017d','\u017e',
+        
'\u017b','\u017c','\u014a','\u014b','\u0000','\u0000','\u0000','\u0000',
+        
'\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000',
+        
'\u0000','\u0000','\u1ef2','\u1ef3','\u010e','\u010f','\u01a0','\u01a1',
+        
'\u01af','\u01b0','\u0114','\u0115','\u012c','\u012d','\u0049','\u0131',
+        '\u014e','\u014f'
+        },
+        // WP Charset 2: Phonetic (145 chars)
+        {
+        
'\u02b9','\u02ba','\u02bb','\u0020','\u02bd','\u02bc','\u0020','\u02be',
+        
'\u02bf','\u0310','\u02d0','\u02d1','\u0306','\u032e','\u0329','\u02c8',
+        
'\u02cc','\u02c9','\u02ca','\u02cb','\u02cd','\u02ce','\u02cf','\u02c6',
+        
'\u02c7','\u02dc','\u0325','\u02da','\u032d','\u032c','\u0323','\u0308',
+        
'\u0324','\u031c','\u031d','\u031e','\u031f','\u0320','\u0321','\u0322',
+        
'\u032a','\u032b','\u02d2','\u02d3','\u0361','\u0356','\u005f','\u2017',
+        
'\u033e','\u02db','\u0327','\u0233','\u030d','\u02b0','\u02b6','\u0250',
+        
'\u0251','\u0252','\u0253','\u0299','\u0254','\u0255','\u0297','\u0256',
+        
'\u0257','\u0258','\u0259','\u025a','\u025b','\u025c','\u025d','\u029a',
+        
'\u025e','\u025f','\u0278','\u0261','\u0260','\u0262','\u029b','\u0263',
+        
'\u0264','\u0265','\u0266','\u0267','\u029c','\u0268','\u026a','\u0269',
+        
'\u029d','\u029e','\u026b','\u026c','\u026d','\u029f','\u026e','\u028e',
+        
'\u026f','\u0270','\u0271','\u0272','\u0273','\u0274','\u0276','\u0277',
+        
'\u02a0','\u0279','\u027a','\u027b','\u027c','\u027d','\u027e','\u027f',
+        
'\u0280','\u0281','\u0282','\u0283','\u0284','\u0285','\u0286','\u0287',
+        
'\u0288','\u0275','\u0289','\u028a','\u028c','\u028b','\u028d','\u03c7',
+        
'\u028f','\u0290','\u0291','\u0292','\u0293','\u0294','\u0295','\u0296',
+        
'\u02a1','\u02a2','\u0298','\u02a3','\u02a4','\u02a5','\u02a6','\u02a7',
+        '\u02a8'
+        },
+        // WP Charset 3: Box Drawing (88 chars)
+        {
+        
'\u2591','\u2592','\u2593','\u2588','\u258c','\u2580','\u2590','\u2584',
+        
'\u2500','\u2502','\u250c','\u2510','\u2518','\u2514','\u251c','\u252c',
+        
'\u2524','\u2534','\u253c','\u2550','\u2551','\u2554','\u2557','\u255d',
+        
'\u255a','\u2560','\u2566','\u2563','\u2569','\u256c','\u2552','\u2555',
+        
'\u255b','\u2558','\u2553','\u2556','\u255c','\u2559','\u255e','\u2565',
+        
'\u2561','\u2568','\u255f','\u2564','\u2562','\u2567','\u256b','\u256a',
+        
'\u2574','\u2575','\u2576','\u2577','\u2578','\u2579','\u257a','\u257b',
+        
'\u257c','\u257e','\u257d','\u257f','\u251f','\u2522','\u251e','\u2521',
+        
'\u252e','\u2532','\u252d','\u2531','\u2527','\u2526','\u252a','\u2529',
+        
'\u2536','\u253a','\u2535','\u2539','\u2541','\u2546','\u253e','\u2540',
+        '\u2544','\u254a','\u253d','\u2545','\u2548','\u2543','\u2549','\u2547'
+        },
+        // WP Charset 4: Typographic Symbols (102 chars)
+        {
+        
'\u25cf','\u25cb','\u25a0','\u2022','\u002a','\u00b6','\u00a7','\u00a1',
+        
'\u00bf','\u00ab','\u00bb','\u00a3','\u00a5','\u20a7','\u0192','\u00aa',
+        
'\u00ba','\u00bd','\u00bc','\u00a2','\u00b2','\u207f','\u00ae','\u00a9',
+        
'\u00a4','\u00be','\u00b3','\u201b','\u2019','\u2018','\u201f','\u201d',
+        
'\u201c','\u2013','\u2014','\u2039','\u203a','\u25cb','\u25a1','\u2020',
+        
'\u2021','\u2122','\u2120','\u211e','\u25cf','\u25e6','\u25a0','\u25aa',
+        
'\u25a1','\u25ab','\u2012','\ufb00','\ufb03','\ufb04','\ufb01','\ufb02',
+        
'\u2026','\u0024','\u20a3','\u20a2','\u20a0','\u20a4','\u201a','\u201e',
+        
'\u2153','\u2154','\u215b','\u215c','\u215d','\u215e','\u24c2','\u24c5',
+        
'\u20ac','\u2105','\u2106','\u2030','\u2116','\u2014','\u00b9','\u2409',
+        
'\u240c','\u240d','\u240a','\u2424','\u240b','\u267c','\u20a9','\u20a6',
+        
'\u20a8','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        '\u0020','\u0020','\u0020','\u0020','\u1d11','\u1d12'
+                                            // last two uncertain
+        },
+        // WP Charset 5: Iconic Symbol (255 chars)
+        {
+        
'\u2661','\u2662','\u2667','\u2664','\u2642','\u2640','\u263c','\u263a',
+        
'\u263b','\u266a','\u266c','\u25ac','\u2302','\u203c','\u221a','\u21a8',
+        
'\u2310','\u2319','\u25d8','\u25d9','\u21b5','\u2104','\u261c','\u23b5',
+        
'\u2610','\u2612','\u2639','\u266f','\u266d','\u266e','\u260e','\u231a',
+        
'\u231b','\u2701','\u2702','\u2703','\u2704','\u260e','\u2706','\u2707',
+        
'\u2708','\u2709','\u261b','\u261e','\u270c','\u270d','\u270e','\u270f',
+        
'\u2710','\u2711','\u2712','\u2713','\u2714','\u2715','\u2716','\u2717',
+        
'\u2718','\u2719','\u271a','\u271b','\u271c','\u271d','\u271e','\u271f',
+        
'\u2720','\u2721','\u2722','\u2723','\u2724','\u2725','\u2726','\u2727',
+        
'\u2605','\u2606','\u272a','\u272b','\u272c','\u272d','\u272e','\u272f',
+        
'\u2730','\u2731','\u2732','\u2733','\u2734','\u2735','\u2736','\u2737',
+        
'\u2738','\u2739','\u273a','\u273b','\u273c','\u273d','\u273e','\u273f',
+        
'\u2740','\u2741','\u2742','\u2743','\u2744','\u2745','\u2746','\u2747',
+        
'\u2748','\u2749','\u274a','\u274b','\u25cf','\u274d','\u25a0','\u274f',
+        
'\u2750','\u2751','\u2752','\u25b2','\u25bc','\u25c6','\u2756','\u25d7',
+        
'\u2758','\u2759','\u275a','\u275b','\u275c','\u275d','\u275e','\u2036',
+        
'\u2033','\u0020','\u0020','\u0020','\u0020','\u2329','\u232a','\u005b',
+        
'\u005d','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u2190','\u0020','\u0020','\u0020','\u0020','\u0020','\u21e8','\u21e6',
+        
'\u2794','\u0020','\u0020','\u0020','\u0020','\u0020','\u25d6','\u0020',
+        
'\u0020','\u2761','\u2762','\u2763','\u2764','\u2765','\u2766','\u2767',
+        
'\u2663','\u2666','\u2665','\u2660','\u2780','\u2781','\u2782','\u2783',
+        
'\u2784','\u2785','\u2786','\u2787','\u2788','\u2789','\u2776','\u2777',
+        
'\u2778','\u2779','\u277a','\u277b','\u277c','\u277d','\u277e','\u277f',
+        
'\u2780','\u2781','\u2782','\u2783','\u2784','\u2785','\u2786','\u2787',
+        
'\u2788','\u2789','\u278a','\u278b','\u278c','\u278d','\u278e','\u278f',
+        
'\u2790','\u2791','\u2792','\u2793','\u2794','\u2192','\u2194','\u2195',
+        
'\u2798','\u2799','\u279a','\u279b','\u279c','\u279d','\u279e','\u279f',
+        
'\u27a0','\u27a1','\u27a2','\u27a3','\u27a4','\u27a5','\u27a6','\u27a7',
+        
'\u27a8','\u27a9','\u27aa','\u27ab','\u27ac','\u27ad','\u27ae','\u27af',
+        
'\u0020','\u27b1','\u27b2','\u27b3','\u27b4','\u27b5','\u27b6','\u27b7',
+        '\u27b8','\u27b9','\u27ba','\u27bb','\u27bc','\u27bd','\u27be'
+        },
+        // WP Charset 6: Math/Scientific (238 chars)
+        {
+        
'\u2212','\u00b1','\u2264','\u2265','\u221d','\u002f','\u2215','\u2216',
+        
'\u00f7','\u2223','\u27e8','\u27e9','\u223c','\u2248','\u2261','\u2208',
+        
'\u2229','\u2225','\u2211','\u221e','\u00ac','\u2192','\u2190','\u2191',
+        
'\u2193','\u2194','\u2195','\u25b8','\u25c2','\u25b4','\u25be','\u22c5',
+        
'\u00b7','\u2218','\u2219','\u212b','\u00b0','\u00b5','\u203e','\u00d7',
+        
'\u222b','\u220f','\u2213','\u2207','\u2202','\u2032','\u2033','\u2192',
+        
'\u212f','\u2113','\u210f','\u2111','\u211c','\u2118','\u21c4','\u21c6',
+        
'\u21d2','\u21d0','\u21d1','\u21d3','\u21d4','\u21d5','\u2197','\u2198',
+        
'\u2196','\u2199','\u222a','\u2282','\u2283','\u2286','\u2287','\u220d',
+        
'\u2205','\u2308','\u2309','\u230a','\u230b','\u226a','\u226b','\u2220',
+        
'\u2297','\u2295','\u2296','\u2a38','\u2299','\u2227','\u2228','\u22bb',
+        
'\u22a4','\u22a5','\u2312','\u22a2','\u22a3','\u25a1','\u25a0','\u25ca',
+        
'\u25c6','\u27e6','\u27e7','\u2260','\u2262','\u2235','\u2234','\u2237',
+        
'\u222e','\u2112','\u212d','\u2128','\u2118','\u20dd','\u29cb','\u25c7',
+        
'\u22c6','\u2034','\u2210','\u2243','\u2245','\u227a','\u227c','\u227b',
+        
'\u227d','\u2203','\u2200','\u22d8','\u22d9','\u228e','\u228a','\u228b',
+        
'\u2293','\u2294','\u228f','\u2291','\u22e4','\u2290','\u2292','\u22e5',
+        
'\u25b3','\u25bd','\u25c3','\u25b9','\u22c8','\u2323','\u2322','\u25ef',
+        
'\u219d','\u21a9','\u21aa','\u21a3','\u21bc','\u21bd','\u21c0','\u21c1',
+        
'\u21cc','\u21cb','\u21bf','\u21be','\u21c3','\u21c2','\u21c9','\u21c7',
+        
'\u22d3','\u22d2','\u22d0','\u22d1','\u229a','\u229b','\u229d','\u2127',
+        
'\u2221','\u2222','\u25c3','\u25b9','\u25b5','\u25bf','\u2214','\u2250',
+        
'\u2252','\u2253','\u224e','\u224d','\u22a8','\u2258','\u226c','\u0285',
+        
'\u2605','\u226e','\u2270','\u226f','\u2271','\u2241','\u2244','\u2247',
+        
'\u2249','\u2280','\u22e0','\u2281','\u22e1','\u2284','\u2285','\u2288',
+        
'\u2289','\u0020','\u0020','\u22e2','\u22e3','\u2226','\u2224','\u226d',
+        
'\u2204','\u2209','\u2247','\u2130','\u2131','\u2102','\u0020','\u2115',
+        
'\u211d','\u225f','\u22be','\u220b','\u22ef','\u2026','\u22ee','\u22f1',
+        
'\u0020','\u20e1','\u002b','\u002d','\u003d','\u002a','\u2032','\u2033',
+        '\u2034','\u210b','\u2118','\u2272','\u2273','\u0020'
+        },
+        // WP Charset 7 Math/Scientific Extended (229 chars)
+        {
+        
'\u2320','\u2321','\u23a5','\u23bd','\u221a','\u0020','\u2211','\u220f',
+        
'\u2210','\u222b','\u222e','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u23a7','\u23a8','\u23a9','\u23aa','\u0020','\u0020','\u0020',
+        
'\u0020','\u23ab','\u23ac','\u23ad','\u23aa','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u222a','\u222b','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u239b','\u239d','\u239c','\u0020','\u0020','\u0020','\u0020','\u239e',
+        
'\u23a8','\u239f','\u0020','\u0020','\u0020','\u0020','\u23a1','\u23a3',
+        
'\u23a2','\u0020','\u20aa','\u0020','\u0020','\u23a4','\u23a6','\u23a5',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u22c3','\u22c2','\u228e','\u2a04','\u2294','\u2a06','\u2227','\u22c0',
+        
'\u2228','\u22c1','\u2297','\u2a02','\u2295','\u2a01','\u2299','\u2a00',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u0020','\u0020','\u229d','\u0020','\u2238','\u0020',
+        
'\u27e6','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u27e7',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u21bc','\u21bd','\u0020','\u296c','\u296d','\u296a','\u296b','\u0020',
+        
'\u21c9','\u21c7','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u21be','\u21bf','\u21c3','\u21c2','\u0020','\u2293','\u2a05','\u23a1',
+        '\u0020','\u0020','\u0020','\u0020','\u0020'
+        },
+        // WP Charset 8: Greek (219 chars)
+        {
+        
'\u0391','\u03b1','\u0392','\u03b2','\u0392','\u03d0','\u0393','\u03b3',
+        
'\u0394','\u03b4','\u0395','\u03b5','\u0396','\u03b6','\u0397','\u03b7',
+        
'\u0398','\u03b8','\u0399','\u03b9','\u039a','\u03ba','\u039b','\u03bb',
+        
'\u039c','\u03bc','\u039d','\u03bd','\u039e','\u03be','\u039f','\u03bf',
+        
'\u03a0','\u03c0','\u03a1','\u03c1','\u03a3','\u03c3','\u03a3','\u03c2',
+        
'\u03a4','\u03c4','\u03a5','\u03c5','\u03a6','\u03c6','\u03a7','\u03c7',
+        
'\u03a8','\u03c8','\u03a9','\u03c9','\u0386','\u03ac','\u0388','\u03ad',
+        
'\u0389','\u03ae','\u038a','\u03af','\u03aa','\u03ca','\u038c','\u03cc',
+        
'\u038e','\u03cd','\u03ab','\u03cb','\u038f','\u03ce','\u03b5','\u03d1',
+        
'\u03f0','\u03d6','\u03f1','\u03c2','\u03d2','\u03d5','\u03c9','\u037e',
+        
'\u0387','\u0374','\u0375','\u0384','\u00a8','\u0385','\u1fed','\u1fef',
+        
'\u1fc0','\u1fbd','\u1ffe','\u037a','\u1fce','\u1fde','\u1fcd','\u1fdd',
+        
'\u1fcf','\u1fdf','\u0384','\u1fef','\u1fc0','\u1fbd','\u1ffe','\u1fce',
+        
'\u1fde','\u1fcd','\u1fdd','\u1fcf','\u1fdf','\u1f70','\u1fb6','\u1fb3',
+        
'\u1fb4','\u1fb2','\u1fb7','\u1f00','\u1f04','\u1f02','\u1f06','\u1f80',
+        
'\u1f84','\u1f82','\u1f86','\u1f01','\u1f05','\u1f03','\u1f07','\u1f81',
+        
'\u1f85','\u1f83','\u1f87','\u1f72','\u1f10','\u1f14','\u1f12','\u1f11',
+        
'\u1f15','\u1f13','\u1f74','\u1fc6','\u1fc3','\u1fc4','\u1fc2','\u1fc7',
+        
'\u1f20','\u1f24','\u1f22','\u1f26','\u1f90','\u1f94','\u1f92','\u1f96',
+        
'\u1f21','\u1f25','\u1f23','\u1f27','\u1f91','\u1f95','\u1f93','\u1f97',
+        
'\u1f76','\u1fd6','\u0390','\u1fd2','\u1f30','\u1f34','\u1f32','\u1f36',
+        
'\u1f31','\u1f35','\u1f33','\u1f37','\u1f78','\u1f40','\u1f44','\u1f42',
+        
'\u1f41','\u1f45','\u1f43','\u1fe5','\u1fe4','\u1f7a','\u1fe6','\u03b0',
+        
'\u1fe2','\u1f50','\u1f54','\u1f52','\u1f56','\u1f51','\u1f55','\u1f53',
+        
'\u1f57','\u1f7c','\u1ff6','\u1ff3','\u1ff4','\u1ff2','\u1ff7','\u1f60',
+        
'\u1f64','\u1f62','\u1f66','\u1fa0','\u1fa4','\u1fa2','\u1fa6','\u1f61',
+        
'\u1f65','\u1f63','\u1f67','\u1fa1','\u1fa5','\u1fa3','\u1fa7','\u03da',
+        '\u03dc','\u03de','\u03e0'
+        },
+        // WP Charset 9: Hebrew (123 chars)
+        {
+        
'\u05d0','\u05d1','\u05d2','\u05d3','\u05d4','\u05d5','\u05d6','\u05d7',
+        
'\u05d8','\u05d9','\u05da','\u05db','\u05dc','\u05dd','\u05de','\u05df',
+        
'\u05e0','\u05e1','\u05e2','\u05e3','\u05e4','\u05e5','\u05e6','\u05e7',
+        
'\u05e8','\u05e9','\u05ea','\u05be','\u05c0','\u05c3','\u05f3','\u05f4',
+        
'\u05b0','\u05b1','\u05b2','\u05b3','\u05b4','\u05b5','\u05b6','\u05b7',
+        
'\u05b8','\u05b9','\u05b9','\u05bb','\u05bc','\u05bd','\u05bf','\u05b7',
+        
'\ufb1e','\u05f0','\u05f1','\u05f2','\ufb1f','\u0591','\u0596','\u0020',
+        
'\u05a4','\u059a','\u059b','\u05a3','\u05a5','\u05a6','\u05a7','\u05a2',
+        
'\u0592','\u0593','\u0594','\u0595','\u0597','\u0598','\u0599','\u05a8',
+        
'\u05f3','\u05f3','\u05f4','\u0020','\u05a9','\u05a0','\u059f','\u05ab',
+        
'\u05ac','\u05af','\u05c4','\u05aa','\ufb30','\ufb31','\u05d1','\ufb32',
+        
'\ufb33','\ufb34','\ufb35','\ufb4b','\ufb36','\u05d7','\ufb38','\ufb39',
+        
'\ufb1d','\ufb3b','\ufb3a','\u05da','\u05da','\u05da','\u05da','\u05da',
+        
'\u05da','\ufb3c','\ufb3e','\ufb40','\u05d5','\ufb41','\ufb44','\u05e4',
+        
'\ufb46','\ufb47','\ufb2b','\ufb2d','\ufb2a','\ufb2c','\ufb4a','\u05dc',
+        '\ufb3c','\ufb49','\u20aa'
+        },
+        // WP Charset 10: Cyrillic (250 chars)
+        {
+        
'\u0410','\u0430','\u0411','\u0431','\u0412','\u0432','\u0413','\u0433',
+        
'\u0414','\u0434','\u0415','\u0435','\u0401','\u0451','\u0416','\u0436',
+        
'\u0417','\u0437','\u0418','\u0438','\u0419','\u0439','\u041a','\u043a',
+        
'\u041b','\u043b','\u041c','\u043c','\u041d','\u043d','\u041e','\u043e',
+        
'\u041f','\u043f','\u0420','\u0440','\u0421','\u0441','\u0422','\u0442',
+        
'\u0423','\u0443','\u0424','\u0444','\u0425','\u0445','\u0426','\u0446',
+        
'\u0427','\u0447','\u0428','\u0448','\u0429','\u0449','\u042a','\u044a',
+        
'\u042b','\u044b','\u042c','\u044c','\u042d','\u044d','\u042e','\u044e',
+        
'\u042f','\u044f','\u04d8','\u04d9','\u0403','\u0453','\u0490','\u0491',
+        
'\u0492','\u0493','\u0402','\u0452','\u0404','\u0454','\u0404','\u0454',
+        
'\u0496','\u0497','\u0405','\u0455','\u0020','\u0020','\u0418','\u0438',
+        
'\u0406','\u0456','\u0407','\u0457','\u0020','\u0020','\u0408','\u0458',
+        
'\u040c','\u045c','\u049a','\u049b','\u04c3','\u04c4','\u049c','\u049d',
+        
'\u0409','\u0459','\u04a2','\u04a3','\u040a','\u045a','\u047a','\u047b',
+        
'\u0460','\u0461','\u040b','\u045b','\u040e','\u045e','\u04ee','\u04ef',
+        
'\u04ae','\u04af','\u04b0','\u04b1','\u0194','\u0263','\u04b2','\u04b3',
+        
'\u0425','\u0445','\u04ba','\u04bb','\u047e','\u047f','\u040f','\u045f',
+        
'\u04b6','\u04b7','\u04b8','\u04b9','\u0428','\u0448','\u0462','\u0463',
+        
'\u0466','\u0467','\u046a','\u046b','\u046e','\u046f','\u0470','\u0471',
+        
'\u0472','\u0473','\u0474','\u0475','\u0410','\u0430','\u0415','\u0435',
+        
'\u0404','\u0454','\u0418','\u0438','\u0406','\u0456','\u0407','\u0457',
+        
'\u041e','\u043e','\u0423','\u0443','\u042b','\u044b','\u042d','\u044d',
+        
'\u042e','\u044e','\u042f','\u044f','\u0410','\u0430','\u0400','\u0450',
+        
'\u0401','\u0451','\u040d','\u045d','\u041e','\u043e','\u0423','\u0443',
+        
'\u042b','\u044b','\u042d','\u044d','\u042e','\u044e','\u042f','\u044f',
+        
'\u0301','\u0300','\u0308','\u0306','\u0326','\u0328','\u0304','\u0020',
+        
'\u201e','\u201c','\u10d0','\u10d1','\u10d2','\u10d3','\u10d4','\u10d5',
+        
'\u10d6','\u10f1','\u10d7','\u10d8','\u10d9','\u10da','\u10db','\u10dc',
+        
'\u10f2','\u10dd','\u10de','\u10df','\u10e0','\u10e1','\u10e2','\u10e3',
+        
'\u10f3','\u10e4','\u10e5','\u10e6','\u10e7','\u10e8','\u10e9','\u10ea',
+        
'\u10eb','\u10ec','\u10ed','\u10ee','\u10f4','\u10ef','\u10f0','\u10f5',
+        '\u10f6','\u10e3'
+        },
+        // WP Charset 11: Japanese (63 chars)
+        {
+        
'\uff61','\uff62','\uff63','\uff64','\uff65','\uff66','\uff67','\uff68',
+        
'\uff69','\uff6a','\uff6b','\uff6c','\uff6d','\uff6e','\uff6f','\uff70',
+        
'\uff71','\uff72','\uff73','\uff74','\uff75','\uff76','\uff77','\uff78',
+        
'\uff79','\uff7a','\uff7b','\uff7c','\uff7d','\uff7e','\uff7f','\uff80',
+        
'\uff81','\uff82','\uff83','\uff84','\uff85','\uff86','\uff87','\uff88',
+        
'\uff89','\uff8a','\uff8b','\uff8c','\uff8d','\uff8e','\uff8f','\uff90',
+        
'\uff91','\uff92','\uff93','\uff94','\uff95','\uff96','\uff97','\uff98',
+        '\uff99','\uff9a','\uff9b','\uff9c','\uff9d','\uff9e','\uff9f'
+        },
+        // WP Charset 12: Current Font Symbols (256 chars)
+        {  //TODO implement Current Font Symbols
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
+        ' '
+        },
+        // WP Charset 13: Arabic (196 chars)
+        {
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u0020','\u064e','\ufe77','\u064f','\ufe79','\u0650','\ufe7b',
+        
'\u064b','\u064c','\u064c','\u064d','\u0652','\ufe7f','\u0651','\ufe7d',
+        
'\ufc60','\ufcf2','\ufc61','\ufcf3','\ufc62','\ufcf4','\u064b','\ufc5e',
+        
'\ufc5e','\ufc5f','\u0653','\u0670','\u0654','\u0020','\u060c','\u061b',
+        
'\u061f','\u066d','\u066a','\u00bb','\u00ab','\u0029','\u0028','\u0661',
+        
'\u0662','\u0663','\u0664','\u0665','\u0666','\u0667','\u0668','\u0669',
+        
'\u0660','\u0662','\u0627','\ufe8e','\u0628','\ufe91','\ufe92','\ufe90',
+        
'\u062a','\ufe97','\ufe98','\ufe96','\u062b','\ufe9b','\ufe9c','\ufe9a',
+        
'\u062c','\ufe9f','\ufea0','\ufe9e','\u062d','\ufea3','\ufea4','\ufea2',
+        
'\u062e','\ufea7','\ufea8','\ufea6','\u062f','\ufeaa','\u0630','\ufeac',
+        
'\u0631','\ufeae','\u0632','\ufeaf','\u0633','\ufeb3','\ufeb4','\ufeb2',
+        
'\u0634','\ufeb7','\ufeb8','\ufeb6','\u0635','\ufebb','\ufebc','\ufeba',
+        
'\u0636','\ufebf','\ufec0','\ufebe','\u0637','\ufec3','\ufec4','\ufec2',
+        
'\u0638','\ufec7','\ufec8','\ufec6','\u0639','\ufecb','\ufecc','\ufeca',
+        
'\u063a','\ufecf','\ufed0','\ufece','\u0641','\ufed3','\ufed4','\ufed2',
+        
'\u0642','\ufed7','\ufed8','\ufed6','\u0643','\ufedb','\ufedc','\ufeda',
+        
'\u0644','\ufedf','\ufee0','\ufede','\u0645','\ufee3','\ufee4','\ufee2',
+        
'\u0646','\ufee7','\ufee8','\ufee6','\u0647','\ufeeb','\ufeec','\ufeea',
+        
'\u0629','\ufe94','\u0648','\ufeee','\u064a','\ufef3','\ufef4','\ufef2',
+        
'\u0649','\ufef3','\ufef4','\ufef0','\u0621','\u0623','\ufe84','\u0625',
+        
'\ufe88','\u0624','\ufe86','\u0626','\ufe8b','\ufe8c','\ufe8a','\ufd3d',
+        
'\ufd3c','\u0622','\ufe82','\u0671','\ufb51','\ufefb','\ufefc','\ufef7',
+        
'\ufef8','\ufef9','\ufefa','\u0020','\ufefc','\ufef5','\ufef6','\u0020',
+        '\u0020','\ufdf2','\u0640','\u0640'
+        },
+        // WP Charset 14: Arabic Script (220 chars)
+        {
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0615',
+        
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0615','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020',
+        
'\u0020','\u06d4','\u0020','\u0020','\u00b0','\u0020','\u065a','\u0020',
+        
'\u065a','\u0659','\u0020','\u0020','\u0654','\u064c','\ufc5e','\u065a',
+        
'\u065a','\u06f4','\u06f4','\u06f5','\u06f6','\u06f6','\u06f7','\u06f8',
+        
'\u067b','\ufb54','\ufb55','\ufb53','\u0680','\ufb5c','\ufb5d','\ufb5b',
+        
'\u067e','\ufb58','\ufb59','\ufb57','\u0679','\ufb68','\ufb69','\ufb67',
+        
'\u067c','\u067c','\u067c','\u067c','\u067f','\ufb64','\ufb65','\ufb63',
+        
'\u067d','\u067d','\u067d','\u067d','\u067a','\ufb60','\ufb61','\ufb5f',
+        
'\u0684','\ufb74','\ufb75','\ufb73','\u0683','\ufb78','\ufb79','\ufb77',
+        
'\u0686','\ufb7c','\ufb7d','\ufb7b','\u0687','\ufb80','\ufb81','\ufb7f',
+        
'\u0685','\u0685','\u0685','\u0685','\u0681','\u0681','\u0681','\u0681',
+        
'\u0688','\ufb89','\u0689','\u0689','\u068c','\ufb85','\u068e','\ufb87',
+        
'\u068a','\u068a','\u068d','\ufb83','\u0693','\u0693','\u0691','\ufb8d',
+        
'\u0699','\u0699','\u0695','\u0695','\u0692','\u0692','\u0698','\ufb8b',
+        
'\u0696','\u0696','\u075b','\u075b','\u069a','\u069a','\u069a','\u069a',
+        
'\u06a0','\u06a0','\u06a0','\u06a0','\u06a4','\ufb6c','\ufb6d','\ufb6b',
+        
'\u06a6','\ufb70','\ufb71','\ufb6f','\u06a9','\ufb90','\ufb91','\ufb8f',
+        
'\u0643','\ufedb','\ufedc','\ufeda','\u06aa','\u06aa','\u06aa','\u06aa',
+        
'\u06af','\ufb94','\ufb95','\ufb93','\u06af','\ufb94','\ufb95','\ufb93',
+        
'\u06ab','\u06ab','\u06ab','\u06ab','\u06b1','\ufb9c','\ufb9d','\ufb9b',
+        
'\u06b3','\ufb98','\ufb99','\ufb97','\u06b5','\u06b5','\u06b5','\u06b5',
+        
'\u0020','\u0020','\u06ba','\u0020','\u0020','\ufb9f','\u06bc','\u06bc',
+        
'\u06bc','\u06bc','\u06bb','\ufba2','\ufba3','\ufba1','\u06c6','\ufbda',
+        
'\u0020','\u0020','\u06ca','\u06ca','\u06c1','\ufba8','\ufba9','\ufba7',
+        
'\u06ce','\u06ce','\u06ce','\u06ce','\u06d2','\ufbaf','\u06d1','\u06d1',
+        '\u06d1','\u06d1','\u06c0','\ufba5'
+        },
+    }; 
+
+    
+    /**
+     * Constructor. Declared private so that code outside this class cannot
+     * create instances of it.
+     */
+    private WP6Constants() {
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
new file mode 100644
index 0000000..4b81256
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java
@@ -0,0 +1,192 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+/**
+ * WordPerfect file header.
+ * @author Pascal Essiembre
+ */
+public class WP6FileHeader {
+
+    // Normal header
+    private String fileId;
+    private long docAreaPointer;
+    private int productType;
+    private int fileType;
+    private int majorVersion;
+    private int minorVersion;
+    private boolean encrypted;
+    private int indexAreaPointer;
+
+    // Extended header
+    private long fileSize;
+
+    /**
+     * Creates a header whose fields all hold their Java default values.
+     */
+    public WP6FileHeader() {
+        super();
+    }
+
+    /** @return the file identifier, possibly {@code null} */
+    public String getFileId() {
+        return fileId;
+    }
+
+    /** @param fileId the file identifier */
+    public void setFileId(String fileId) {
+        this.fileId = fileId;
+    }
+
+    /** @return the pointer to the document area */
+    public long getDocAreaPointer() {
+        return docAreaPointer;
+    }
+
+    /** @param docAreaPointer the pointer to the document area */
+    public void setDocAreaPointer(long docAreaPointer) {
+        this.docAreaPointer = docAreaPointer;
+    }
+
+    /** @return the product type code */
+    public int getProductType() {
+        return productType;
+    }
+
+    /** @param productType the product type code */
+    public void setProductType(int productType) {
+        this.productType = productType;
+    }
+
+    /** @return the file type code */
+    public int getFileType() {
+        return fileType;
+    }
+
+    /** @param fileType the file type code */
+    public void setFileType(int fileType) {
+        this.fileType = fileType;
+    }
+
+    /** @return the major version number */
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    /** @param majorVersion the major version number */
+    public void setMajorVersion(int majorVersion) {
+        this.majorVersion = majorVersion;
+    }
+
+    /** @return the minor version number */
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    /** @param minorVersion the minor version number */
+    public void setMinorVersion(int minorVersion) {
+        this.minorVersion = minorVersion;
+    }
+
+    /** @return {@code true} if the encrypted flag is set */
+    public boolean isEncrypted() {
+        return encrypted;
+    }
+
+    /** @param encrypted the encrypted flag */
+    public void setEncrypted(boolean encrypted) {
+        this.encrypted = encrypted;
+    }
+
+    /** @return the pointer to the index area */
+    public int getIndexAreaPointer() {
+        return indexAreaPointer;
+    }
+
+    /** @param indexAreaPointer the pointer to the index area */
+    public void setIndexAreaPointer(int indexAreaPointer) {
+        this.indexAreaPointer = indexAreaPointer;
+    }
+
+    /** @return the file size stored in the extended header */
+    public long getFileSize() {
+        return fileSize;
+    }
+
+    /** @param fileSize the file size stored in the extended header */
+    public void setFileSize(long fileSize) {
+        this.fileSize = fileSize;
+    }
+
+    /**
+     * Lists every field, in declaration order, using the default
+     * {@link ToStringBuilder} style.
+     */
+    @Override
+    public String toString() {
+        return new ToStringBuilder(this)
+                .append("fileId", fileId)
+                .append("docAreaPointer", docAreaPointer)
+                .append("productType", productType)
+                .append("fileType", fileType)
+                .append("majorVersion", majorVersion)
+                .append("minorVersion", minorVersion)
+                .append("encrypted", encrypted)
+                .append("indexAreaPointer", indexAreaPointer)
+                .append("fileSize", fileSize)
+                .toString();
+    }
+
+    /**
+     * Combines the hash of every field with the usual "31 * h + part" fold.
+     * The order of the parts matters and must stay in sync with
+     * {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        int[] parts = {
+            foldLong(docAreaPointer),
+            encrypted ? 1231 : 1237,
+            fileId != null ? fileId.hashCode() : 0,
+            foldLong(fileSize),
+            fileType,
+            indexAreaPointer,
+            majorVersion,
+            minorVersion,
+            productType
+        };
+        int hash = 1;
+        for (int part : parts) {
+            hash = 31 * hash + part;
+        }
+        return hash;
+    }
+
+    // Folds the upper 32 bits of a long into the lower 32 bits.
+    private static int foldLong(long value) {
+        return (int) (value ^ (value >>> 32));
+    }
+
+    /**
+     * Two headers are equal when all of their fields are equal.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        // instanceof is false for null, so no separate null check is needed.
+        if (!(obj instanceof WP6FileHeader)) {
+            return false;
+        }
+        WP6FileHeader that = (WP6FileHeader) obj;
+        boolean sameFileId = (fileId == null)
+                ? that.fileId == null
+                : fileId.equals(that.fileId);
+        return sameFileId
+                && docAreaPointer == that.docAreaPointer
+                && encrypted == that.encrypted
+                && fileSize == that.fileSize
+                && fileType == that.fileType
+                && indexAreaPointer == that.indexAreaPointer
+                && majorVersion == that.majorVersion
+                && minorVersion == that.minorVersion
+                && productType == that.productType;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
new file mode 100644
index 0000000..1a2198d
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java
@@ -0,0 +1,218 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts text from a WordPerfect document according to WP6 File Format.
+ * This format appears to be compatible with more recent versions too.
+ * @author Pascal Essiembre
+ */
+public class WP6TextExtractor {
+
+    /** Number of buffered characters after which text is flushed. */
+    private static final int CHUNK_SIZE = 4096;
+
+    /**
+     * Reads the file header, records the header values as metadata, then
+     * writes the text found in the document area to the given handler.
+     * @param input the WordPerfect content; the header is read from the
+     *        current position of this stream
+     * @param xhtml handler receiving the extracted text
+     * @param metadata metadata populated with the header values
+     * @throws IOException if the stream cannot be read
+     * @throws SAXException if the handler fails to process an event
+     */
+    public void extract(
+            InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException {
+        WPInputStream in = new WPInputStream(input);
+
+        WP6FileHeader header = parseFileHeader(in);
+
+        applyMetadata(header, metadata);
+
+        // For text extraction we can safely ignore WP Index Area and
+        // Packet Data Area and jump right away to Document Area.
+        extractDocumentText(in, header.getDocAreaPointer(), xhtml);
+    }
+
+    // Copies the parsed header values into the metadata object.
+    private void applyMetadata(WP6FileHeader header, Metadata metadata) {
+        metadata.set(WordPerfect.FILE_SIZE,
+                Long.toString(header.getFileSize()));
+        metadata.set(WordPerfect.FILE_ID, header.getFileId());
+        metadata.set(WordPerfect.PRODUCT_TYPE, header.getProductType());
+        metadata.set(WordPerfect.FILE_TYPE, header.getFileType());
+        metadata.set(WordPerfect.MAJOR_VERSION, header.getMajorVersion());
+        metadata.set(WordPerfect.MINOR_VERSION, header.getMinorVersion());
+        metadata.set(WordPerfect.ENCRYPTED,
+                Boolean.toString(header.isEncrypted()));
+    }
+
+    /**
+     * Skips to the document area and emits its text inside a single "p"
+     * element, flushing to the handler every {@link #CHUNK_SIZE} characters.
+     * <p>
+     * Codes above 127 that are deliberately ignored:
+     * 130,131,133 (soft hyphens), 134 (invisible return in line),
+     * 136 (soft end of center/align), 140 (style separator mark),
+     * 141,142 (start/end of text to skip), 143 (exited hyphenation),
+     * 144 (cancel hyphenation), 145-151 (match functions),
+     * 152-179 (unknown/ignored).
+     * @param in stream positioned at the start of the file
+     * @param offset number of bytes to skip to reach the document area
+     * @param xhtml handler receiving the text
+     * @throws IOException if the stream cannot be read
+     * @throws SAXException if the handler fails to process an event
+     */
+    private void extractDocumentText(
+            WPInputStream in, long offset, XHTMLContentHandler xhtml)
+                    throws IOException, SAXException {
+        xhtml.startElement("p");
+
+        // Move to offset (for some reason skip() did not work). The counter
+        // is a long because the offset comes from an unsigned 32-bit value
+        // and may therefore exceed Integer.MAX_VALUE.
+        for (long i = 0; i < offset; i++) {
+            in.readWPByte();
+        }
+
+        StringBuilder out = new StringBuilder(CHUNK_SIZE);
+
+        int c;
+        while ((c = in.read()) != -1) {
+            if (c > 0 && c <= 32) {
+                out.append(WP6Constants.DEFAULT_EXTENDED_INTL_CHARS[c]);
+            } else if (c >= 33 && c <= 126) {
+                out.append((char) c);
+            } else if (c == 128) {
+                out.append(' ');      // Soft space
+            } else if (c == 129) {
+                out.append('\u00A0'); // Hard space
+            } else if (c == 132) {
+                // Hard hyphen. This branch used to test 129 a second time
+                // and could never be reached.
+                out.append('-');
+            } else if (c == 135 || c == 137) {
+                out.append('\n');     // Dormant Hard return
+            } else if (c == 138) {
+                // skip to closing pair surrounding page number
+                skipUntilChar(in, 139);
+            } else if (c == 198) {
+                // end of cell
+                out.append('\t');
+            } else if (c >= 180 && c <= 207) {
+                out.append('\n');
+            } else if (c >= 208 && c <= 239) {
+                appendVariableLengthFunction(in, c, out);
+            } else if (c == 240) {
+                appendExtendedChar(in, out);
+            } else if (c >= 241 && c <= 255) {
+                // The content is dropped up to the next occurrence of the
+                // same code. 255 is reserved but is handled the same way.
+                skipUntilChar(in, c);
+            }
+
+            if (out.length() >= CHUNK_SIZE) {
+                xhtml.characters(out.toString());
+                out.setLength(0);
+            }
+        }
+
+        xhtml.characters(out.toString());
+        out.setLength(0);
+        xhtml.endElement("p");
+    }
+
+    // Consumes a variable-length multi-byte function (codes 208-239) and
+    // appends the white space that stands for it, if any.
+    private void appendVariableLengthFunction(
+            WPInputStream in, int group, StringBuilder out)
+                    throws IOException {
+        int subgroup = in.read();
+        int functionSize = in.readWPShort();
+        // Four bytes were consumed so far (group, subgroup and the two size
+        // bytes); they are subtracted from the declared size.
+        for (int i = 0; i < functionSize - 4; i++) {
+            in.read();
+        }
+
+        if (group == 208) {
+            // End-of-Line group
+            if (subgroup >= 1 && subgroup <= 3) {
+                out.append(' ');
+            } else if (subgroup == 10) {
+                // end of cell
+                out.append('\t');
+            } else if (subgroup >= 4 && subgroup <= 19) {
+                out.append('\n');
+            } else if (subgroup >= 20 && subgroup <= 22) {
+                out.append(' ');
+            } else if (subgroup >= 23 && subgroup <= 28) {
+                out.append('\n');
+            }
+        } else if (group == 213) {
+            out.append(' ');
+        } else if (group == 224) {
+            out.append('\t');
+        }
+        //TODO Are there functions containing data? Like footnotes?
+    }
+
+    // Consumes the three bytes following an extended character code (240)
+    // and appends the corresponding character.
+    private void appendExtendedChar(WPInputStream in, StringBuilder out)
+            throws IOException {
+        int charval = in.read();
+        int charset = in.read();
+        in.read(); // closing character
+
+        if (charval < 0 || charset < 0) {
+            // The stream ended inside the sequence: nothing to append.
+            return;
+        }
+
+        //TODO implement all charsets
+        if (charset == 4 || charset == 5) {
+            if (charval < WP6Constants.EXTENDED_CHARSETS[charset].length) {
+                out.append(
+                        WP6Constants.EXTENDED_CHARSETS[charset][charval]);
+            } else {
+                // The table has no entry for this value. A space is used,
+                // as the tables themselves do for unmapped characters,
+                // instead of failing with an ArrayIndexOutOfBoundsException.
+                out.append(' ');
+            }
+        } else {
+            out.append("[TODO:charset" + charset + "]");
+        }
+    }
+
+    /**
+     * Reads and discards bytes until the given value has been read or the
+     * end of the stream is reached.
+     * @param in stream to read from
+     * @param targetChar value to stop at (it is consumed as well)
+     * @return the number of bytes consumed
+     * @throws IOException if the stream cannot be read
+     */
+    private int skipUntilChar(WPInputStream in, int targetChar)
+            throws IOException {
+        int count = 0;
+        int c;
+        while ((c = in.read()) != -1) {
+            count++;
+            if (c == targetChar) {
+                return count;
+            }
+        }
+        return count;
+    }
+
+    /**
+     * Reads the file header and then rewinds the stream to where it was
+     * when this method was called.
+     * @param in stream to read the header from
+     * @return the parsed header
+     * @throws IOException if the normal header cannot be read
+     */
+    private WP6FileHeader parseFileHeader(WPInputStream in)
+            throws IOException {
+        WP6FileHeader header = new WP6FileHeader();
+
+        // File header
+        in.mark(30);
+        header.setFileId(in.readWPString(4));         // 1-4
+        header.setDocAreaPointer(in.readWPLong());    // 5-8
+        header.setProductType(in.read());             // 9
+        header.setFileType(in.readWPChar());          // 10
+        header.setMajorVersion(in.read());            // 11
+        header.setMinorVersion(in.read());            // 12
+        header.setEncrypted(in.readWPShort() != 0);   // 13-14
+        header.setIndexAreaPointer(in.readWPShort()); // 15-16
+        try {
+            // 4 reserved bytes (17-20). They are consumed with read()
+            // rather than skip(), which is allowed to skip fewer bytes
+            // than requested.
+            for (int i = 0; i < 4; i++) {
+                in.read();
+            }
+            header.setFileSize(in.readWPLong());      // 21-24
+        } catch (IOException ignored) {
+            // May fail if there is no extended header, which is fine:
+            // the file size then keeps its default value.
+        }
+        in.reset();
+
+        //TODO header may be shared between corel products, so move validation
+        //specific to each product elsewhere?
+        //TODO convert to logs only, and let it fail elsewhere?
+        // Validation currently disabled: it would throw an IOException
+        // ("Not a WordPerfect file. ...") when the file id differs from
+        // WP6Constants.WP6_FILE_ID or when the product type differs from
+        // WP6Constants.WP6_PRODUCT_TYPE.
+        //TODO perform file type validation?
+        return header;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
new file mode 100644
index 0000000..2da276b
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WPInputStream.java
@@ -0,0 +1,196 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * {@link InputStream} wrapper adding WordPerfect-specific byte-reading
+ * methods. Multi-byte values are stored least-significant byte first and
+ * are treated as unsigned.
+ * @author Pascal Essiembre
+ */
+public class WPInputStream extends InputStream {
+
+    private final DataInputStream in;
+
+    /**
+     * Constructor. The supplied stream is wrapped in a
+     * {@link BufferedInputStream} unless it already is one.
+     * @param in input stream (must not be <code>null</code>)
+     */
+    public WPInputStream(InputStream in) {
+        if (in == null) {
+            throw new NullPointerException("in");
+        }
+        BufferedInputStream bis;
+        if (in instanceof BufferedInputStream) {
+            bis = (BufferedInputStream) in;
+        } else {
+            bis = new BufferedInputStream(in);
+        }
+        this.in = new DataInputStream(bis);
+    }
+
+    /**
+     * Reads a WordPerfect "short": a 2 bytes (16-bit) unsigned value in
+     * reverse order.
+     * @return an integer value between 0 and 65535
+     * @throws IOException if not enough bytes remain
+     */
+    public int readWPShort() throws IOException {
+        int ch1 = in.read();
+        int ch2 = in.read();
+        // read() yields -1 at end of stream; OR-ing keeps the sign bit.
+        if ((ch1 | ch2) < 0) {
+            throw new EOFException();
+        }
+        return (ch2 << 8) | ch1;
+    }
+
+    /**
+     * Reads a WordPerfect "long": a 4 bytes (32-bit) unsigned value in
+     * reverse order.
+     * @return a long value between 0 and 4294967295
+     * @throws IOException if not enough bytes remain
+     */
+    public long readWPLong() throws IOException {
+        int ch1 = in.read();
+        int ch2 = in.read();
+        int ch3 = in.read();
+        int ch4 = in.read();
+        if ((ch1 | ch2 | ch3 | ch4) < 0) {
+            throw new EOFException();
+        }
+        // Widen the most significant byte to long BEFORE shifting: an int
+        // shift by 24 overflows into the sign bit for bytes >= 0x80 and
+        // would turn large unsigned values into negative ones.
+        return ((long) ch4 << 24) | (ch3 << 16) | (ch2 << 8) | ch1;
+    }
+
+    /**
+     * Reads a WordPerfect byte (8-bit).
+     * @return byte value
+     * @throws IOException if not enough bytes remain
+     */
+    public byte readWPByte() throws IOException {
+        return in.readByte();
+    }
+
+    /**
+     * Skips the specified number of WordPerfect byte (8-bit).
+     * @param numOfBytes number of bytes to skip
+     * @throws IOException if not enough bytes remain
+     */
+    public void skipWPByte(int numOfBytes) throws IOException {
+        // Read byte by byte (instead of calling skip) so that hitting the
+        // end of the stream is reported as an EOFException.
+        for (int i = 0; i < numOfBytes; i++) {
+            readWPByte();
+        }
+    }
+
+    /**
+     * Reads a WordPerfect character (8-bit).
+     * @return character
+     * @throws IOException if not enough bytes remain
+     */
+    public char readWPChar() throws IOException {
+        int c = in.read();
+        if (c == -1) {
+            // Without this check the cast below would yield '\uFFFF'.
+            throw new EOFException();
+        }
+        return (char) c;
+    }
+
+    /**
+     * Reads a WordPerfect string of specified length (1 byte per character).
+     * @param length how many characters to read
+     * @return a string
+     * @throws IOException if not enough bytes remain
+     */
+    public String readWPString(int length) throws IOException {
+        char[] chars = new char[length];
+        for (int i = 0; i < length; i++) {
+            int c = in.read();
+            if (c == -1) {
+                throw new EOFException();
+            }
+            chars[i] = (char) c;
+        }
+        return new String(chars);
+    }
+
+    /**
+     * Reads a series of bytes of the specified length, converting
+     * each byte to its two-digit hexadecimal representation.
+     * @param numOfBytes how many bytes to read
+     * @return an hexadecimal string (two lowercase digits per byte)
+     * @throws IOException if not enough bytes remain
+     */
+    public String readWPHexString(int numOfBytes) throws IOException {
+        StringBuilder b = new StringBuilder();
+        for (int i = 0; i < numOfBytes; i++) {
+            b.append(readWPHex());
+        }
+        return b.toString();
+    }
+
+    /**
+     * Reads the next byte and returns it as an hexadecimal value.
+     * @return two-digit, lowercase hexadecimal string for a single byte
+     * @throws IOException if not enough bytes remain
+     */
+    public String readWPHex() throws IOException {
+        int value = in.read();
+        if (value == -1) {
+            // Previously the end of stream was rendered as the string "-1".
+            throw new EOFException();
+        }
+        String hex = Integer.toHexString(value);
+        return hex.length() < 2 ? "0" + hex : hex;
+    }
+
+
+    // Plain InputStream operations are delegated to the wrapped stream.
+
+    @Override
+    public int read() throws IOException {
+        return in.read();
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException {
+        return in.read(b);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+        return in.read(b, off, len);
+    }
+
+    @Override
+    public long skip(long n) throws IOException {
+        return in.skip(n);
+    }
+
+    @Override
+    public int available() throws IOException {
+        return in.available();
+    }
+
+    @Override
+    public void close() throws IOException {
+        in.close();
+    }
+
+    @Override
+    public synchronized void mark(int readlimit) {
+        in.mark(readlimit);
+    }
+
+    @Override
+    public synchronized void reset() throws IOException {
+        in.reset();
+    }
+
+    @Override
+    public boolean markSupported() {
+        return in.markSupported();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
new file mode 100644
index 0000000..aadbd35
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java
@@ -0,0 +1,69 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata property definitions for WordPerfect documents. Every property
+ * name is made of {@link #WORDPERFECT_METADATA_NAME_PREFIX}, followed by
+ * {@link Metadata#NAMESPACE_PREFIX_DELIMITER} and the local name.
+ * @author Pascal Essiembre
+ */
+public interface WordPerfect {
+
+   /** Prefix shared by all WordPerfect metadata property names. */
+   String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect";
+
+   /** Size of the file, as declared in the document header. */
+   Property FILE_SIZE = Property.internalText(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "FileSize");
+
+   /** File identifier. */
+   Property FILE_ID = Property.internalText(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "FileId");
+
+   /** Product type. */
+   Property PRODUCT_TYPE = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "ProductType");
+
+   /** File type. */
+   Property FILE_TYPE = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "FileType");
+
+   /** Major version. */
+   Property MAJOR_VERSION = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "MajorVersion");
+
+   /** Minor version. */
+   Property MINOR_VERSION = Property.internalInteger(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "MinorVersion");
+
+   /** Whether the document is flagged as encrypted. */
+   Property ENCRYPTED = Property.internalBoolean(
+           WORDPERFECT_METADATA_NAME_PREFIX
+           + Metadata.NAMESPACE_PREFIX_DELIMITER
+           + "Encrypted");
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java
new file mode 100644
index 0000000..105e803
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java
@@ -0,0 +1,74 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * <p>Parser for Corel WordPerfect documents. Targets WP6 File Format
+ * but appears to be compatible with more recent versions too.</p>
+ * <p>Text extraction is delegated to {@link WP6TextExtractor}; the result
+ * is emitted as XHTML to the supplied content handler.</p>
+ * @author Pascal Essiembre
+ */
+public class WordPerfectParser extends AbstractParser {
+
+    private static final long serialVersionUID = 8941810225917012232L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            buildSupportedTypes();
+
+    /**
+     * Builds the read-only set of "application/*" media types this parser
+     * declares support for.
+     */
+    private static Set<MediaType> buildSupportedTypes() {
+        Set<MediaType> types = new HashSet<MediaType>();
+        for (String subtype : Arrays.asList(
+                "vnd.wordperfect",
+                "wordperfect",
+                "wordperfect5.1",
+                "wordperfect6.0",
+                "wordperfect6.1",
+                "x-corel-wordperfect")) {
+            types.add(MediaType.application(subtype));
+        }
+        return Collections.unmodifiableSet(types);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts the text of a WordPerfect document as XHTML.
+     * @param stream the document to parse
+     * @param handler receives the XHTML SAX events
+     * @param metadata document metadata; a default content type is set
+     *        only when none is present yet
+     * @param context parse context (not used by this parser)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Keep any content type already present; only fill in a default.
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        if (declaredType == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/wordperfect");
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        new WP6TextExtractor().extract(stream, xhtml, metadata);
+        xhtml.endDocument();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 4d6e6d4..602ee2c 100644
--- 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -57,6 +57,8 @@ org.apache.tika.parser.pkg.RarParser
 org.apache.tika.parser.rtf.RTFParser
 org.apache.tika.parser.txt.TXTParser
 org.apache.tika.parser.video.FLVParser
+org.apache.tika.parser.wordperfect.QuattroProParser
+org.apache.tika.parser.wordperfect.WordPerfectParser
 org.apache.tika.parser.xml.DcXMLParser
 org.apache.tika.parser.dif.DIFParser
 org.apache.tika.parser.xml.FictionBookParser

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
new file mode 100644
index 0000000..ea5d12f
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java
@@ -0,0 +1,58 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+
+/**
+ * Junit test class for the {@link QuattroProParser}.
+ * @author Pascal Essiembre
+ */
+public class QuattroProTest extends TikaTest {
+
+    private final Tika tika = new Tika();
+
+    //TODO add testWB/testQUATTRO.wb3 if .wb? files get supported
+
+    /**
+     * Parses the sample .qpw spreadsheet through the default Tika parser
+     * and checks both the resulting content type and the extracted text.
+     */
+    @Test
+    public void testQPW() throws Exception {
+        File file = getResourceAsFile("/test-documents/testQUATTRO.qpw");
+
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        FileInputStream stream = new FileInputStream(file);
+        try {
+            tika.getParser().parse(
+                    stream,
+                    new WriteOutContentHandler(writer),
+                    metadata,
+                    new ParseContext());
+        } finally {
+            // Always release the file handle, even when parsing fails.
+            stream.close();
+        }
+        String content = writer.toString();
+
+        assertEquals("application/x-quattro-pro",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertContains("This is an example spreadsheet", content);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
new file mode 100644
index 0000000..c3af274
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java
@@ -0,0 +1,56 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+
+/**
+ * Junit test class for the {@link WordPerfectParser}.
+ * @author Pascal Essiembre
+ */
+public class WordPerfectTest extends TikaTest {
+
+    private final Tika tika = new Tika();
+
+    /**
+     * Parses the sample .wpd document through the default Tika parser
+     * and checks both the resulting content type and the extracted text.
+     */
+    @Test
+    public void testWordPerfectParser() throws Exception {
+        File file = getResourceAsFile("/test-documents/testWordPerfect.wpd");
+
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        FileInputStream stream = new FileInputStream(file);
+        try {
+            tika.getParser().parse(
+                    stream,
+                    new WriteOutContentHandler(writer),
+                    metadata,
+                    new ParseContext());
+        } finally {
+            // Always release the file handle, even when parsing fails.
+            stream.close();
+        }
+        String content = writer.toString();
+
+        assertEquals("application/vnd.wordperfect",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertContains("test test", content);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd 
b/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd
new file mode 100644
index 0000000..4c6ae5d
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd differ

Reply via email to