Repository: tika Updated Branches: refs/heads/master 202f137b7 -> d011d708c
New WordPerfect and QuattroPro parsers for TIKA-1946 contributed by pascal.essiembre Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/87c2ef31 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/87c2ef31 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/87c2ef31 Branch: refs/heads/master Commit: 87c2ef3191d0a86502dc249240022b3cc973aaa4 Parents: 2dbd651 Author: Pascal Essiembre <[email protected]> Authored: Tue Dec 20 15:42:39 2016 -0500 Committer: Pascal Essiembre <[email protected]> Committed: Tue Dec 20 15:42:39 2016 -0500 ---------------------------------------------------------------------- .../org/apache/tika/mime/tika-mimetypes.xml | 35 +- .../parser/wordperfect/QPWTextExtractor.java | 223 ++++++++++ .../tika/parser/wordperfect/QuattroPro.java | 72 ++++ .../parser/wordperfect/QuattroProParser.java | 71 +++ .../tika/parser/wordperfect/WP6Constants.java | 432 +++++++++++++++++++ .../tika/parser/wordperfect/WP6FileHeader.java | 192 +++++++++ .../parser/wordperfect/WP6TextExtractor.java | 218 ++++++++++ .../tika/parser/wordperfect/WPInputStream.java | 196 +++++++++ .../tika/parser/wordperfect/WordPerfect.java | 69 +++ .../parser/wordperfect/WordPerfectParser.java | 74 ++++ .../services/org.apache.tika.parser.Parser | 2 + .../tika/parser/wordperfect/QuattroProTest.java | 58 +++ .../parser/wordperfect/WordPerfectTest.java | 56 +++ .../test-documents/testWordPerfect.wpd | Bin 0 -> 2044 bytes 14 files changed, 1694 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 30068da..7afdb4d 100644 --- 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -2585,11 +2585,28 @@ <mime-type type="application/vnd.wmf.bootstrap"/> <mime-type type="application/vnd.wordperfect"> <alias type="application/wordperfect"/> + <alias type="application/wordperfect5.1"/> + <alias type="application/wordperfect6.0"/> + <alias type="application/wordperfect6.1"/> + <alias type="application/x-corel-wordperfect"/> + <acronym>WPD</acronym> + <_comment>WordPerfect - Corel Word Processing</_comment> + <tika:link>http://en.wikipedia.org/wiki/WordPerfect</tika:link> + <tika:uti>com.corel.wordperfect.doc</tika:uti> + <magic priority="60"> + <match value="0xFF575043" type="big32" offset="0"/> + </magic> <magic priority="50"> - <match value="0xFF575043" type="string" offset="0:3"/> <!-- ÿWPC --> + <match value="application/vnd.wordperfect;" type="string" offset="0"></match> </magic> <glob pattern="*.wpd"/> - </mime-type> + <glob pattern="*.wp"/> + <glob pattern="*.wp5"/> + <glob pattern="*.wp6"/> + <glob pattern="*.w60"/> + <glob pattern="*.wp61"/> + <glob pattern="*.wpt"/> + </mime-type> <mime-type type="application/vnd.wqd"> <glob pattern="*.wqd"/> </mime-type> @@ -3725,11 +3742,21 @@ </mime-type> <mime-type type="application/x-quattro-pro"> + <_comment> + Quattro Pro - Corel Spreadsheet (part of WordPerfect Office suite) + </_comment> + <!-- Conflicts with MS Word .doc format: + <magic priority="90"> + <match value="0xD0CF11E0A1B11AE1" type="string" offset="0"/> + </magic> + --> + <magic priority="50"> + <match value="0x00000200" type="big32" offset="0"/> + </magic> <glob pattern="*.qpw"/> - <glob pattern="*.wb1"/> + <glob pattern="*.wb1"/> <glob pattern="*.wb2"/> <glob pattern="*.wb3"/> - <sub-class-of type="application/x-tika-msoffice"/> </mime-type> <mime-type type="application/xquery"> 
http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
new file mode 100644
index 0000000..2242001
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java
@@ -0,0 +1,223 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts text from a Quattro Pro document according to QPW v9 File Format.
+ * This format appears to be compatible with more recent versions too.
+ *
+ * <p>The content is read from the {@code NativeContent_MAIN} OLE entry as a
+ * sequence of records. For each record a type and a body length are read,
+ * then the body is handed to the handler registered for that type; records
+ * of any other type are skipped.</p>
+ *
+ * @author Pascal Essiembre
+ */
+public class QPWTextExtractor {
+
+    private static final Logger LOG =
+            LogManager.getLogger(QPWTextExtractor.class);
+
+    private static final String OLE_DOCUMENT_NAME = "NativeContent_MAIN";
+
+    /**
+     * Record handlers. Each constant reads the body of one record from
+     * {@code ctx.in} and may write text to {@code ctx.xhtml} or set values
+     * on {@code ctx.metadata}.
+     */
+    private enum Extractor{
+        // Skips the whole record body.
+        IGNORE { @Override public void extract(Context ctx) throws IOException {
+            ctx.in.skipWPByte(ctx.bodyLength);
+        }},
+        BOF { @Override public void extract(Context ctx) throws IOException {
+            ctx.metadata.set(QuattroPro.ID, ctx.in.readWPString(4));
+            ctx.metadata.set(QuattroPro.VERSION, ctx.in.readWPShort());
+            ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
+            ctx.in.readWPShort(); // Last saved bits
+            ctx.metadata.set(QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
+            ctx.metadata.set(QuattroPro.PAGE_COUNT, ctx.in.readWPShort());
+            // 14 = the 4-character ID plus the five shorts read above.
+            ctx.in.skipWPByte(ctx.bodyLength - 14);
+        }},
+        USER { @Override public void extract(Context ctx) throws IOException {
+            ctx.metadata.set(QuattroPro.CREATOR, getQstrLabel(ctx.in));
+            ctx.metadata.set(QuattroPro.LAST_USER, getQstrLabel(ctx.in));
+        }},
+        EXT_LINK { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // index
+            ctx.in.readWPShort(); // page first
+            ctx.in.readWPShort(); // page last
+            ctx.xhtml.characters(getQstrLabel(ctx.in));
+            ctx.xhtml.characters(System.lineSeparator());
+        }},
+        STRING_TABLE { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            long entries = ctx.in.readWPLong();
+            ctx.in.readWPLong(); // Total used
+            ctx.in.readWPLong(); // Total saved
+            for (int i = 0; i < entries; i++) {
+                ctx.xhtml.characters(getQstrLabel(ctx.in));
+                ctx.xhtml.characters(System.lineSeparator());
+            }
+        }},
+        BOS { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // sheet #
+            ctx.in.readWPShort(); // first col index
+            ctx.in.readWPShort(); // last col index
+            ctx.in.readWPLong(); // first row index
+            ctx.in.readWPLong(); // last row index
+            ctx.in.readWPShort(); // format
+            ctx.in.readWPShort(); // flags
+            ctx.xhtml.characters(getQstrLabel(ctx.in));
+            ctx.xhtml.characters(System.lineSeparator());
+        }},
+        SHEET_HEADFOOT { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // flag
+            ctx.xhtml.characters(getQstrLabel(ctx.in));
+            ctx.xhtml.characters(System.lineSeparator());
+        }},
+        FORMULA_STRING_VALUE { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // column
+            ctx.in.readWPLong(); // row
+            ctx.xhtml.characters(getQstrLabel(ctx.in));
+        }},
+        CGENERICLABEL { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // column
+            ctx.in.readWPLong(); // row
+            ctx.in.readWPShort(); // format index
+            ctx.xhtml.characters(getQstrLabel(ctx.in));
+        }},
+        CCOMMENT { @Override public void extract(Context ctx)
+                throws IOException, SAXException {
+            ctx.in.readWPShort(); // column
+            ctx.in.readWPLong(); // row
+            ctx.in.readWPLong(); // flag
+            ctx.xhtml.characters(getQstrLabel(ctx.in)); // author name
+            ctx.xhtml.characters(getQstrLabel(ctx.in)); // comment
+        }},
+        // Use to print out a chunk
+        DEBUG { @Override public void extract(Context ctx) throws IOException {
+            LOG.error("REC ("
+                    + Integer.toHexString(ctx.type) + "/" + ctx.bodyLength
+                    + "):" + ctx.in.readWPString(ctx.bodyLength));
+        }};
+
+        /**
+         * Processes the body of the current record.
+         *
+         * @param ctx stream, output handler, metadata and current record
+         *        type/length
+         * @throws IOException if reading from the stream fails
+         * @throws SAXException if writing to the output handler fails
+         */
+        public abstract void extract(Context ctx)
+                throws IOException, SAXException;
+    }
+
+    // Holds extractors for each record type we are interested in.
+    // All record types not defined here will be skipped.
+    private static final Map<Integer, Extractor> EXTRACTORS =
+            new HashMap<Integer, Extractor>();
+    static {
+        //--- Global Records ---
+        EXTRACTORS.put(0x0001, Extractor.BOF); // Beginning of file
+        EXTRACTORS.put(0x0005, Extractor.USER); // User
+
+        //--- Notebook Records ---
+        EXTRACTORS.put(0x0403, Extractor.EXT_LINK); // External link
+        EXTRACTORS.put(0x0407, Extractor.STRING_TABLE); // String table
+
+        //--- Sheet Records ---
+        EXTRACTORS.put(0x0601, Extractor.BOS); // Beginning of sheet
+        EXTRACTORS.put(0x0605, Extractor.SHEET_HEADFOOT); // Sheet header
+        EXTRACTORS.put(0x0606, Extractor.SHEET_HEADFOOT); // Sheet footer
+
+        //--- Cells ---
+        EXTRACTORS.put(0x0c02, Extractor.FORMULA_STRING_VALUE);
+        EXTRACTORS.put(0x0c72, Extractor.CGENERICLABEL);
+        EXTRACTORS.put(0x0c80, Extractor.CCOMMENT);
+    }
+
+    /**
+     * State handed to the record handlers: the input stream, the output
+     * handler, the metadata, and the type and body length of the record
+     * being processed. Declared static because it never uses an enclosing
+     * {@code QPWTextExtractor} instance.
+     */
+    static class Context {
+        private final WPInputStream in;
+        private final XHTMLContentHandler xhtml;
+        private final Metadata metadata;
+        private int type;
+        private int bodyLength;
+        public Context(WPInputStream in, XHTMLContentHandler xhtml,
+                Metadata metadata) {
+            super();
+            this.in = in;
+            this.xhtml = xhtml;
+            this.metadata = metadata;
+        }
+    }
+
+    /**
+     * Extracts text and metadata from a Quattro Pro OLE document.
+     * If the document has no {@code NativeContent_MAIN} entry, a message is
+     * logged and nothing is written to {@code xhtml}.
+     *
+     * @param input stream holding the OLE document
+     * @param xhtml handler receiving the extracted text, wrapped in a single
+     *        {@code p} element
+     * @param metadata metadata populated from the records read
+     * @throws IOException if reading the document fails
+     * @throws SAXException if writing to the handler fails
+     */
+    @SuppressWarnings("resource")
+    public void extract(
+            InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException {
+
+        POIFSFileSystem pfs = new POIFSFileSystem(input);
+        DirectoryNode rootNode = pfs.getRoot();
+        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
+            // rootNode may be null on this branch, so it must not be
+            // dereferenced unconditionally while building the message.
+            LOG.info("Unsupported QuattroPro file format. "
+                    + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME
+                    + "\". Found: "
+                    + (rootNode == null ? null : rootNode.getEntryNames()));
+            return;
+        }
+
+        //TODO shall we validate and throw warning/error if the file does not
+        //start with a BOF and end with an EOF?
+        xhtml.startElement("p");
+        try (WPInputStream in = new WPInputStream(
+                pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
+            Context ctx = new Context(in, xhtml, metadata);
+            while (hasNext(in)) {
+                ctx.type = in.readWPShort();
+                ctx.bodyLength = in.readWPShort();
+                Extractor extractor = EXTRACTORS.get(ctx.type);
+                if (extractor != null) {
+                    extractor.extract(ctx);
+                } else {
+                    // Use DEBUG to find out what we are ignoring
+                    //Extractor.DEBUG.extract(ctx);
+                    Extractor.IGNORE.extract(ctx);
+                }
+            }
+        }
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Tells whether at least one more byte can be read from the stream.
+     * The byte is not consumed: the stream is marked before the read and
+     * reset afterwards.
+     */
+    private boolean hasNext(InputStream in) throws IOException {
+        try {
+            in.mark(1);
+            return in.read() != -1;
+        } finally {
+            in.reset();
+        }
+    }
+
+    /**
+     * Reads a label: a short holding a count, one string-type byte (ignored),
+     * then {@code count + 1} characters.
+     *
+     * @param in stream positioned at the start of the label
+     * @return the characters read, as a string
+     * @throws IOException if reading from the stream fails
+     */
+    private static String getQstrLabel(WPInputStream in) throws IOException {
+        // QSTR
+        int count = in.readWPShort();
+        in.readWPByte(); // string type
+        char[] text = new char[count+1];
+        text[0] = in.readWPChar();
+
+        // QSTRLABEL
+        for (int i = 0; i < count; i++) {
+            text[i+1] = in.readWPChar();
+        }
+        return new String(text);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
new file mode 100644
index 0000000..8270f8d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroPro.java
@@ -0,0 +1,72 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata names and properties used for QuattroPro documents.
+ * @author Pascal Essiembre
+ */
+public interface QuattroPro {
+
+    /** Prefix shared by the names of all properties declared below. */
+    String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect";
+
+    /** Plain metadata name for the creator. */
+    String META_CREATOR = "creator";
+
+    /** Plain metadata name for the last user. */
+    String META_LAST_USER = "last-user";
+
+    /** ID, as an internal text property. */
+    Property ID = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Id");
+
+    /** Version, as an internal integer property. */
+    Property VERSION = Property.internalInteger(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version");
+
+    /** Build, as an internal integer property. */
+    Property BUILD = Property.internalInteger(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Build");
+
+    /** Lowest version, as an internal integer property. */
+    Property LOWEST_VERSION = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "LowestVersion");
+
+    /** Number of pages, as an internal integer property. */
+    Property PAGE_COUNT = Property.internalInteger(
+            QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "PageCount");
+
+    /** Creator, as an internal text property. */
+    Property CREATOR = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "Creator");
+
+    /** Last user, as an internal text property. */
+    Property LAST_USER = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX
+            + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastUser");
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
new file mode 100644
index 0000000..735486f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/QuattroProParser.java
@@ -0,0 +1,71 @@
+/* Copyright 2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wordperfect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * <p>Parser for Corel QuattroPro documents (part of Corel WordPerfect
+ * Office Suite).
+ * Targets QPW v9 File Format
+ * but appears to be compatible with more recent versions too.</p>
+ * @author Pascal Essiembre
+ */
+public class QuattroProParser extends AbstractParser {
+
+    private static final long serialVersionUID = 8941810225917012232L;
+
+    /** Unmodifiable set holding the single media type this parser handles. */
+    private static final Set<MediaType> SUPPORTED_TYPES;
+    static {
+        Set<MediaType> types = new HashSet<MediaType>(
+                Arrays.asList(MediaType.application("x-quattro-pro")));
+        SUPPORTED_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return the supported media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Writes the document text to {@code handler} as XHTML and fills in
+     * {@code metadata}, delegating the extraction to
+     * {@link QPWTextExtractor}.
+     *
+     * @param stream document content
+     * @param handler receiver of the XHTML events
+     * @param metadata document metadata, read and updated
+     * @param context parse context (not consulted)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Only declare a content type when none has been set already.
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        if (declaredType == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-quattro-pro");
+        }
+
+        XHTMLContentHandler document =
+                new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        new QPWTextExtractor().extract(stream, document, metadata);
+        document.endDocument();
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
new file mode 100644
index 0000000..f17837c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6Constants.java
@@ -0,0 +1,432 @@
+/* Copyright 2015-2016 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +/** + * WordPerfect constant values used for parsing and extracting text. + * @author Pascal Essiembre + */ +public final class WP6Constants { + + + public static final String WP6_FILE_ID = "ÿWPC"; + public static final int WP6_PRODUCT_TYPE = 1; + public static final int WP6_FILE_TYPE_WP_DOCUMENT = 10; + public static final int WP6_FILE_TYPE_WPD = 36; + + public static final char[] DEFAULT_EXTENDED_INTL_CHARS = new char[] { + '\0', '\u00E5', '\u00C5', '\u00E6', '\u00C6', + '\u00E4', '\u00C4', '\u00E1', '\u00E0', '\u00E2', + '\u00E3', '\u00C3', '\u00E7', '\u00C7', '\u00EB', + '\u00E9', '\u00C9', '\u00E8', '\u00EA', '\u00ED', + '\u00F1', '\u00D1', '\u00F8', '\u00D8', '\u00F5', + '\u00D5', '\u00F6', '\u00D6', '\u00FC', '\u00DC', + '\u00FA', '\u00F9', '\u00DF', + }; + + /** + * Extended character sets used when fixed-length multi-byte functions + * with a byte value of 240 (0xF0) are found in a WordPerfect document. + * Those character set codes may be specific to WordPerfect + * file specifications and may or may not be considered standard + * outside WordPerfect. 
+ */ + public static final char[][] EXTENDED_CHARSETS = new char[][] { + // WP Charset 0: ASCII (95 chars) + { + ' ', '"', '#', '$', '%', '&', '\'', '(', ')', '*', + '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', + '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', + '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', + ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '{', '|', '}', '~', '\u00A0' + }, + // WP Charset 1: Multinational (242 chars) + { + '\u0300','\u00b7','\u0303','\u0302','\u0335','\u0338','\u0301','\u0308', + '\u0304','\u0313','\u0315','\u02bc','\u0326','\u0315','\u00b0','\u0307', + '\u030b','\u0327','\u0328','\u030c','\u0337','\u0305','\u0306','\u00df', + '\u0138','\u006a','\u00c1','\u00e1','\u00c2','\u00e2','\u00c4','\u00e4', + '\u00c0','\u00e0','\u00c5','\u00e5','\u00c6','\u00e6','\u00c7','\u00e7', + '\u00c9','\u00e9','\u00ca','\u00ea','\u00cb','\u00eb','\u00c8','\u00e8', + '\u00cd','\u00ed','\u00ce','\u00ee','\u00cf','\u00ef','\u00cc','\u00ec', + '\u00d1','\u00f1','\u00d3','\u00f3','\u00d4','\u00f4','\u00d6','\u00f6', + '\u00d2','\u00f2','\u00da','\u00fa','\u00db','\u00fb','\u00dc','\u00fc', + '\u00d9','\u00f9','\u0178','\u00ff','\u00c3','\u00e3','\u0110','\u0111', + '\u00d8','\u00f8','\u00d5','\u00f5','\u00dd','\u00fd','\u00d0','\u00f0', + '\u00de','\u00fe','\u0102','\u0103','\u0100','\u0101','\u0104','\u0105', + '\u0106','\u0107','\u010c','\u010d','\u0108','\u0109','\u010a','\u010b', + '\u010e','\u010f','\u011a','\u011b','\u0116','\u0117','\u0112','\u0113', + '\u0118','\u0119','\u01f4','\u01f5','\u011e','\u011f','\u01e6','\u01e7', + '\u0122','\u0123','\u011c','\u011d','\u0120','\u0121','\u0124','\u0125', + '\u0126','\u0127','\u0130','\u0069','\u012a','\u012b','\u012e','\u012f', + 
'\u0128','\u0129','\u0132','\u0133','\u0134','\u0135','\u0136','\u0137', + '\u0139','\u013a','\u013d','\u013e','\u013b','\u013c','\u013f','\u0140', + '\u0141','\u0142','\u0143','\u0144','\u0000','\u0149','\u0147','\u0148', + '\u0145','\u0146','\u0150','\u0151','\u014c','\u014d','\u0152','\u0153', + '\u0154','\u0155','\u0158','\u0159','\u0156','\u0157','\u015a','\u015b', + '\u0160','\u0161','\u015e','\u015f','\u015c','\u015d','\u0164','\u0165', + '\u0162','\u0163','\u0166','\u0167','\u016c','\u016d','\u0170','\u0171', + '\u016a','\u016b','\u0172','\u0173','\u016e','\u016f','\u0168','\u0169', + '\u0174','\u0175','\u0176','\u0177','\u0179','\u017a','\u017d','\u017e', + '\u017b','\u017c','\u014a','\u014b','\u0000','\u0000','\u0000','\u0000', + '\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000', + '\u0000','\u0000','\u1ef2','\u1ef3','\u010e','\u010f','\u01a0','\u01a1', + '\u01af','\u01b0','\u0114','\u0115','\u012c','\u012d','\u0049','\u0131', + '\u014e','\u014f' + }, + // WP Charset 2: Phonetic (145 chars) + { + '\u02b9','\u02ba','\u02bb','\u0020','\u02bd','\u02bc','\u0020','\u02be', + '\u02bf','\u0310','\u02d0','\u02d1','\u0306','\u032e','\u0329','\u02c8', + '\u02cc','\u02c9','\u02ca','\u02cb','\u02cd','\u02ce','\u02cf','\u02c6', + '\u02c7','\u02dc','\u0325','\u02da','\u032d','\u032c','\u0323','\u0308', + '\u0324','\u031c','\u031d','\u031e','\u031f','\u0320','\u0321','\u0322', + '\u032a','\u032b','\u02d2','\u02d3','\u0361','\u0356','\u005f','\u2017', + '\u033e','\u02db','\u0327','\u0233','\u030d','\u02b0','\u02b6','\u0250', + '\u0251','\u0252','\u0253','\u0299','\u0254','\u0255','\u0297','\u0256', + '\u0257','\u0258','\u0259','\u025a','\u025b','\u025c','\u025d','\u029a', + '\u025e','\u025f','\u0278','\u0261','\u0260','\u0262','\u029b','\u0263', + '\u0264','\u0265','\u0266','\u0267','\u029c','\u0268','\u026a','\u0269', + '\u029d','\u029e','\u026b','\u026c','\u026d','\u029f','\u026e','\u028e', + 
'\u026f','\u0270','\u0271','\u0272','\u0273','\u0274','\u0276','\u0277', + '\u02a0','\u0279','\u027a','\u027b','\u027c','\u027d','\u027e','\u027f', + '\u0280','\u0281','\u0282','\u0283','\u0284','\u0285','\u0286','\u0287', + '\u0288','\u0275','\u0289','\u028a','\u028c','\u028b','\u028d','\u03c7', + '\u028f','\u0290','\u0291','\u0292','\u0293','\u0294','\u0295','\u0296', + '\u02a1','\u02a2','\u0298','\u02a3','\u02a4','\u02a5','\u02a6','\u02a7', + '\u02a8' + }, + // WP Charset 3: Box Drawing (88 chars) + { + '\u2591','\u2592','\u2593','\u2588','\u258c','\u2580','\u2590','\u2584', + '\u2500','\u2502','\u250c','\u2510','\u2518','\u2514','\u251c','\u252c', + '\u2524','\u2534','\u253c','\u2550','\u2551','\u2554','\u2557','\u255d', + '\u255a','\u2560','\u2566','\u2563','\u2569','\u256c','\u2552','\u2555', + '\u255b','\u2558','\u2553','\u2556','\u255c','\u2559','\u255e','\u2565', + '\u2561','\u2568','\u255f','\u2564','\u2562','\u2567','\u256b','\u256a', + '\u2574','\u2575','\u2576','\u2577','\u2578','\u2579','\u257a','\u257b', + '\u257c','\u257e','\u257d','\u257f','\u251f','\u2522','\u251e','\u2521', + '\u252e','\u2532','\u252d','\u2531','\u2527','\u2526','\u252a','\u2529', + '\u2536','\u253a','\u2535','\u2539','\u2541','\u2546','\u253e','\u2540', + '\u2544','\u254a','\u253d','\u2545','\u2548','\u2543','\u2549','\u2547' + }, + // WP Charset 4: Typographic Symbols (102 chars) + { + '\u25cf','\u25cb','\u25a0','\u2022','\u002a','\u00b6','\u00a7','\u00a1', + '\u00bf','\u00ab','\u00bb','\u00a3','\u00a5','\u20a7','\u0192','\u00aa', + '\u00ba','\u00bd','\u00bc','\u00a2','\u00b2','\u207f','\u00ae','\u00a9', + '\u00a4','\u00be','\u00b3','\u201b','\u2019','\u2018','\u201f','\u201d', + '\u201c','\u2013','\u2014','\u2039','\u203a','\u25cb','\u25a1','\u2020', + '\u2021','\u2122','\u2120','\u211e','\u25cf','\u25e6','\u25a0','\u25aa', + '\u25a1','\u25ab','\u2012','\ufb00','\ufb03','\ufb04','\ufb01','\ufb02', + '\u2026','\u0024','\u20a3','\u20a2','\u20a0','\u20a4','\u201a','\u201e', + 
'\u2153','\u2154','\u215b','\u215c','\u215d','\u215e','\u24c2','\u24c5', + '\u20ac','\u2105','\u2106','\u2030','\u2116','\u2014','\u00b9','\u2409', + '\u240c','\u240d','\u240a','\u2424','\u240b','\u267c','\u20a9','\u20a6', + '\u20a8','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u1d11','\u1d12' + // last two uncertain + }, + // WP Charset 5: Iconic Symbol (255 chars) + { + '\u2661','\u2662','\u2667','\u2664','\u2642','\u2640','\u263c','\u263a', + '\u263b','\u266a','\u266c','\u25ac','\u2302','\u203c','\u221a','\u21a8', + '\u2310','\u2319','\u25d8','\u25d9','\u21b5','\u2104','\u261c','\u23b5', + '\u2610','\u2612','\u2639','\u266f','\u266d','\u266e','\u260e','\u231a', + '\u231b','\u2701','\u2702','\u2703','\u2704','\u260e','\u2706','\u2707', + '\u2708','\u2709','\u261b','\u261e','\u270c','\u270d','\u270e','\u270f', + '\u2710','\u2711','\u2712','\u2713','\u2714','\u2715','\u2716','\u2717', + '\u2718','\u2719','\u271a','\u271b','\u271c','\u271d','\u271e','\u271f', + '\u2720','\u2721','\u2722','\u2723','\u2724','\u2725','\u2726','\u2727', + '\u2605','\u2606','\u272a','\u272b','\u272c','\u272d','\u272e','\u272f', + '\u2730','\u2731','\u2732','\u2733','\u2734','\u2735','\u2736','\u2737', + '\u2738','\u2739','\u273a','\u273b','\u273c','\u273d','\u273e','\u273f', + '\u2740','\u2741','\u2742','\u2743','\u2744','\u2745','\u2746','\u2747', + '\u2748','\u2749','\u274a','\u274b','\u25cf','\u274d','\u25a0','\u274f', + '\u2750','\u2751','\u2752','\u25b2','\u25bc','\u25c6','\u2756','\u25d7', + '\u2758','\u2759','\u275a','\u275b','\u275c','\u275d','\u275e','\u2036', + '\u2033','\u0020','\u0020','\u0020','\u0020','\u2329','\u232a','\u005b', + '\u005d','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u2190','\u0020','\u0020','\u0020','\u0020','\u0020','\u21e8','\u21e6', + '\u2794','\u0020','\u0020','\u0020','\u0020','\u0020','\u25d6','\u0020', + 
'\u0020','\u2761','\u2762','\u2763','\u2764','\u2765','\u2766','\u2767', + '\u2663','\u2666','\u2665','\u2660','\u2780','\u2781','\u2782','\u2783', + '\u2784','\u2785','\u2786','\u2787','\u2788','\u2789','\u2776','\u2777', + '\u2778','\u2779','\u277a','\u277b','\u277c','\u277d','\u277e','\u277f', + '\u2780','\u2781','\u2782','\u2783','\u2784','\u2785','\u2786','\u2787', + '\u2788','\u2789','\u278a','\u278b','\u278c','\u278d','\u278e','\u278f', + '\u2790','\u2791','\u2792','\u2793','\u2794','\u2192','\u2194','\u2195', + '\u2798','\u2799','\u279a','\u279b','\u279c','\u279d','\u279e','\u279f', + '\u27a0','\u27a1','\u27a2','\u27a3','\u27a4','\u27a5','\u27a6','\u27a7', + '\u27a8','\u27a9','\u27aa','\u27ab','\u27ac','\u27ad','\u27ae','\u27af', + '\u0020','\u27b1','\u27b2','\u27b3','\u27b4','\u27b5','\u27b6','\u27b7', + '\u27b8','\u27b9','\u27ba','\u27bb','\u27bc','\u27bd','\u27be' + }, + // WP Charset 6: Math/Scientific (238 chars) + { + '\u2212','\u00b1','\u2264','\u2265','\u221d','\u002f','\u2215','\u2216', + '\u00f7','\u2223','\u27e8','\u27e9','\u223c','\u2248','\u2261','\u2208', + '\u2229','\u2225','\u2211','\u221e','\u00ac','\u2192','\u2190','\u2191', + '\u2193','\u2194','\u2195','\u25b8','\u25c2','\u25b4','\u25be','\u22c5', + '\u00b7','\u2218','\u2219','\u212b','\u00b0','\u00b5','\u203e','\u00d7', + '\u222b','\u220f','\u2213','\u2207','\u2202','\u2032','\u2033','\u2192', + '\u212f','\u2113','\u210f','\u2111','\u211c','\u2118','\u21c4','\u21c6', + '\u21d2','\u21d0','\u21d1','\u21d3','\u21d4','\u21d5','\u2197','\u2198', + '\u2196','\u2199','\u222a','\u2282','\u2283','\u2286','\u2287','\u220d', + '\u2205','\u2308','\u2309','\u230a','\u230b','\u226a','\u226b','\u2220', + '\u2297','\u2295','\u2296','\u2a38','\u2299','\u2227','\u2228','\u22bb', + '\u22a4','\u22a5','\u2312','\u22a2','\u22a3','\u25a1','\u25a0','\u25ca', + '\u25c6','\u27e6','\u27e7','\u2260','\u2262','\u2235','\u2234','\u2237', + '\u222e','\u2112','\u212d','\u2128','\u2118','\u20dd','\u29cb','\u25c7', + 
'\u22c6','\u2034','\u2210','\u2243','\u2245','\u227a','\u227c','\u227b', + '\u227d','\u2203','\u2200','\u22d8','\u22d9','\u228e','\u228a','\u228b', + '\u2293','\u2294','\u228f','\u2291','\u22e4','\u2290','\u2292','\u22e5', + '\u25b3','\u25bd','\u25c3','\u25b9','\u22c8','\u2323','\u2322','\u25ef', + '\u219d','\u21a9','\u21aa','\u21a3','\u21bc','\u21bd','\u21c0','\u21c1', + '\u21cc','\u21cb','\u21bf','\u21be','\u21c3','\u21c2','\u21c9','\u21c7', + '\u22d3','\u22d2','\u22d0','\u22d1','\u229a','\u229b','\u229d','\u2127', + '\u2221','\u2222','\u25c3','\u25b9','\u25b5','\u25bf','\u2214','\u2250', + '\u2252','\u2253','\u224e','\u224d','\u22a8','\u2258','\u226c','\u0285', + '\u2605','\u226e','\u2270','\u226f','\u2271','\u2241','\u2244','\u2247', + '\u2249','\u2280','\u22e0','\u2281','\u22e1','\u2284','\u2285','\u2288', + '\u2289','\u0020','\u0020','\u22e2','\u22e3','\u2226','\u2224','\u226d', + '\u2204','\u2209','\u2247','\u2130','\u2131','\u2102','\u0020','\u2115', + '\u211d','\u225f','\u22be','\u220b','\u22ef','\u2026','\u22ee','\u22f1', + '\u0020','\u20e1','\u002b','\u002d','\u003d','\u002a','\u2032','\u2033', + '\u2034','\u210b','\u2118','\u2272','\u2273','\u0020' + }, + // WP Charset 7 Math/Scientific Extended (229 chars) + { + '\u2320','\u2321','\u23a5','\u23bd','\u221a','\u0020','\u2211','\u220f', + '\u2210','\u222b','\u222e','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u23a7','\u23a8','\u23a9','\u23aa','\u0020','\u0020','\u0020', + '\u0020','\u23ab','\u23ac','\u23ad','\u23aa','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u222a','\u222b','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + 
'\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u239b','\u239d','\u239c','\u0020','\u0020','\u0020','\u0020','\u239e', + '\u23a8','\u239f','\u0020','\u0020','\u0020','\u0020','\u23a1','\u23a3', + '\u23a2','\u0020','\u20aa','\u0020','\u0020','\u23a4','\u23a6','\u23a5', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u22c3','\u22c2','\u228e','\u2a04','\u2294','\u2a06','\u2227','\u22c0', + '\u2228','\u22c1','\u2297','\u2a02','\u2295','\u2a01','\u2299','\u2a00', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u0020','\u0020','\u229d','\u0020','\u2238','\u0020', + '\u27e6','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u27e7', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u21bc','\u21bd','\u0020','\u296c','\u296d','\u296a','\u296b','\u0020', + '\u21c9','\u21c7','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u21be','\u21bf','\u21c3','\u21c2','\u0020','\u2293','\u2a05','\u23a1', + '\u0020','\u0020','\u0020','\u0020','\u0020' + }, + // WP Charset 8: Greek (219 chars) + { + '\u0391','\u03b1','\u0392','\u03b2','\u0392','\u03d0','\u0393','\u03b3', + '\u0394','\u03b4','\u0395','\u03b5','\u0396','\u03b6','\u0397','\u03b7', + '\u0398','\u03b8','\u0399','\u03b9','\u039a','\u03ba','\u039b','\u03bb', + '\u039c','\u03bc','\u039d','\u03bd','\u039e','\u03be','\u039f','\u03bf', + '\u03a0','\u03c0','\u03a1','\u03c1','\u03a3','\u03c3','\u03a3','\u03c2', + '\u03a4','\u03c4','\u03a5','\u03c5','\u03a6','\u03c6','\u03a7','\u03c7', + '\u03a8','\u03c8','\u03a9','\u03c9','\u0386','\u03ac','\u0388','\u03ad', + 
'\u0389','\u03ae','\u038a','\u03af','\u03aa','\u03ca','\u038c','\u03cc', + '\u038e','\u03cd','\u03ab','\u03cb','\u038f','\u03ce','\u03b5','\u03d1', + '\u03f0','\u03d6','\u03f1','\u03c2','\u03d2','\u03d5','\u03c9','\u037e', + '\u0387','\u0374','\u0375','\u0384','\u00a8','\u0385','\u1fed','\u1fef', + '\u1fc0','\u1fbd','\u1ffe','\u037a','\u1fce','\u1fde','\u1fcd','\u1fdd', + '\u1fcf','\u1fdf','\u0384','\u1fef','\u1fc0','\u1fbd','\u1ffe','\u1fce', + '\u1fde','\u1fcd','\u1fdd','\u1fcf','\u1fdf','\u1f70','\u1fb6','\u1fb3', + '\u1fb4','\u1fb2','\u1fb7','\u1f00','\u1f04','\u1f02','\u1f06','\u1f80', + '\u1f84','\u1f82','\u1f86','\u1f01','\u1f05','\u1f03','\u1f07','\u1f81', + '\u1f85','\u1f83','\u1f87','\u1f72','\u1f10','\u1f14','\u1f12','\u1f11', + '\u1f15','\u1f13','\u1f74','\u1fc6','\u1fc3','\u1fc4','\u1fc2','\u1fc7', + '\u1f20','\u1f24','\u1f22','\u1f26','\u1f90','\u1f94','\u1f92','\u1f96', + '\u1f21','\u1f25','\u1f23','\u1f27','\u1f91','\u1f95','\u1f93','\u1f97', + '\u1f76','\u1fd6','\u0390','\u1fd2','\u1f30','\u1f34','\u1f32','\u1f36', + '\u1f31','\u1f35','\u1f33','\u1f37','\u1f78','\u1f40','\u1f44','\u1f42', + '\u1f41','\u1f45','\u1f43','\u1fe5','\u1fe4','\u1f7a','\u1fe6','\u03b0', + '\u1fe2','\u1f50','\u1f54','\u1f52','\u1f56','\u1f51','\u1f55','\u1f53', + '\u1f57','\u1f7c','\u1ff6','\u1ff3','\u1ff4','\u1ff2','\u1ff7','\u1f60', + '\u1f64','\u1f62','\u1f66','\u1fa0','\u1fa4','\u1fa2','\u1fa6','\u1f61', + '\u1f65','\u1f63','\u1f67','\u1fa1','\u1fa5','\u1fa3','\u1fa7','\u03da', + '\u03dc','\u03de','\u03e0' + }, + // WP Charset 9: Hebrew (123 chars) + { + '\u05d0','\u05d1','\u05d2','\u05d3','\u05d4','\u05d5','\u05d6','\u05d7', + '\u05d8','\u05d9','\u05da','\u05db','\u05dc','\u05dd','\u05de','\u05df', + '\u05e0','\u05e1','\u05e2','\u05e3','\u05e4','\u05e5','\u05e6','\u05e7', + '\u05e8','\u05e9','\u05ea','\u05be','\u05c0','\u05c3','\u05f3','\u05f4', + '\u05b0','\u05b1','\u05b2','\u05b3','\u05b4','\u05b5','\u05b6','\u05b7', + 
'\u05b8','\u05b9','\u05b9','\u05bb','\u05bc','\u05bd','\u05bf','\u05b7', + '\ufb1e','\u05f0','\u05f1','\u05f2','\ufb1f','\u0591','\u0596','\u0020', + '\u05a4','\u059a','\u059b','\u05a3','\u05a5','\u05a6','\u05a7','\u05a2', + '\u0592','\u0593','\u0594','\u0595','\u0597','\u0598','\u0599','\u05a8', + '\u05f3','\u05f3','\u05f4','\u0020','\u05a9','\u05a0','\u059f','\u05ab', + '\u05ac','\u05af','\u05c4','\u05aa','\ufb30','\ufb31','\u05d1','\ufb32', + '\ufb33','\ufb34','\ufb35','\ufb4b','\ufb36','\u05d7','\ufb38','\ufb39', + '\ufb1d','\ufb3b','\ufb3a','\u05da','\u05da','\u05da','\u05da','\u05da', + '\u05da','\ufb3c','\ufb3e','\ufb40','\u05d5','\ufb41','\ufb44','\u05e4', + '\ufb46','\ufb47','\ufb2b','\ufb2d','\ufb2a','\ufb2c','\ufb4a','\u05dc', + '\ufb3c','\ufb49','\u20aa' + }, + // WP Charset 10: Cyrillic (250 chars) + { + '\u0410','\u0430','\u0411','\u0431','\u0412','\u0432','\u0413','\u0433', + '\u0414','\u0434','\u0415','\u0435','\u0401','\u0451','\u0416','\u0436', + '\u0417','\u0437','\u0418','\u0438','\u0419','\u0439','\u041a','\u043a', + '\u041b','\u043b','\u041c','\u043c','\u041d','\u043d','\u041e','\u043e', + '\u041f','\u043f','\u0420','\u0440','\u0421','\u0441','\u0422','\u0442', + '\u0423','\u0443','\u0424','\u0444','\u0425','\u0445','\u0426','\u0446', + '\u0427','\u0447','\u0428','\u0448','\u0429','\u0449','\u042a','\u044a', + '\u042b','\u044b','\u042c','\u044c','\u042d','\u044d','\u042e','\u044e', + '\u042f','\u044f','\u04d8','\u04d9','\u0403','\u0453','\u0490','\u0491', + '\u0492','\u0493','\u0402','\u0452','\u0404','\u0454','\u0404','\u0454', + '\u0496','\u0497','\u0405','\u0455','\u0020','\u0020','\u0418','\u0438', + '\u0406','\u0456','\u0407','\u0457','\u0020','\u0020','\u0408','\u0458', + '\u040c','\u045c','\u049a','\u049b','\u04c3','\u04c4','\u049c','\u049d', + '\u0409','\u0459','\u04a2','\u04a3','\u040a','\u045a','\u047a','\u047b', + '\u0460','\u0461','\u040b','\u045b','\u040e','\u045e','\u04ee','\u04ef', + 
'\u04ae','\u04af','\u04b0','\u04b1','\u0194','\u0263','\u04b2','\u04b3', + '\u0425','\u0445','\u04ba','\u04bb','\u047e','\u047f','\u040f','\u045f', + '\u04b6','\u04b7','\u04b8','\u04b9','\u0428','\u0448','\u0462','\u0463', + '\u0466','\u0467','\u046a','\u046b','\u046e','\u046f','\u0470','\u0471', + '\u0472','\u0473','\u0474','\u0475','\u0410','\u0430','\u0415','\u0435', + '\u0404','\u0454','\u0418','\u0438','\u0406','\u0456','\u0407','\u0457', + '\u041e','\u043e','\u0423','\u0443','\u042b','\u044b','\u042d','\u044d', + '\u042e','\u044e','\u042f','\u044f','\u0410','\u0430','\u0400','\u0450', + '\u0401','\u0451','\u040d','\u045d','\u041e','\u043e','\u0423','\u0443', + '\u042b','\u044b','\u042d','\u044d','\u042e','\u044e','\u042f','\u044f', + '\u0301','\u0300','\u0308','\u0306','\u0326','\u0328','\u0304','\u0020', + '\u201e','\u201c','\u10d0','\u10d1','\u10d2','\u10d3','\u10d4','\u10d5', + '\u10d6','\u10f1','\u10d7','\u10d8','\u10d9','\u10da','\u10db','\u10dc', + '\u10f2','\u10dd','\u10de','\u10df','\u10e0','\u10e1','\u10e2','\u10e3', + '\u10f3','\u10e4','\u10e5','\u10e6','\u10e7','\u10e8','\u10e9','\u10ea', + '\u10eb','\u10ec','\u10ed','\u10ee','\u10f4','\u10ef','\u10f0','\u10f5', + '\u10f6','\u10e3' + }, + // WP Charset 11: Japanese (63 chars) + { + '\uff61','\uff62','\uff63','\uff64','\uff65','\uff66','\uff67','\uff68', + '\uff69','\uff6a','\uff6b','\uff6c','\uff6d','\uff6e','\uff6f','\uff70', + '\uff71','\uff72','\uff73','\uff74','\uff75','\uff76','\uff77','\uff78', + '\uff79','\uff7a','\uff7b','\uff7c','\uff7d','\uff7e','\uff7f','\uff80', + '\uff81','\uff82','\uff83','\uff84','\uff85','\uff86','\uff87','\uff88', + '\uff89','\uff8a','\uff8b','\uff8c','\uff8d','\uff8e','\uff8f','\uff90', + '\uff91','\uff92','\uff93','\uff94','\uff95','\uff96','\uff97','\uff98', + '\uff99','\uff9a','\uff9b','\uff9c','\uff9d','\uff9e','\uff9f' + }, + // WP Charset 12: Current Font Symbols (256 chars) + { //TODO implement Current Font Symbols + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' 
',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', + ' ' + }, + // WP Charset 13: Arabic (196 chars) + { + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u0020','\u064e','\ufe77','\u064f','\ufe79','\u0650','\ufe7b', + '\u064b','\u064c','\u064c','\u064d','\u0652','\ufe7f','\u0651','\ufe7d', + '\ufc60','\ufcf2','\ufc61','\ufcf3','\ufc62','\ufcf4','\u064b','\ufc5e', + '\ufc5e','\ufc5f','\u0653','\u0670','\u0654','\u0020','\u060c','\u061b', + '\u061f','\u066d','\u066a','\u00bb','\u00ab','\u0029','\u0028','\u0661', + '\u0662','\u0663','\u0664','\u0665','\u0666','\u0667','\u0668','\u0669', + '\u0660','\u0662','\u0627','\ufe8e','\u0628','\ufe91','\ufe92','\ufe90', + '\u062a','\ufe97','\ufe98','\ufe96','\u062b','\ufe9b','\ufe9c','\ufe9a', + '\u062c','\ufe9f','\ufea0','\ufe9e','\u062d','\ufea3','\ufea4','\ufea2', + '\u062e','\ufea7','\ufea8','\ufea6','\u062f','\ufeaa','\u0630','\ufeac', + '\u0631','\ufeae','\u0632','\ufeaf','\u0633','\ufeb3','\ufeb4','\ufeb2', + 
'\u0634','\ufeb7','\ufeb8','\ufeb6','\u0635','\ufebb','\ufebc','\ufeba', + '\u0636','\ufebf','\ufec0','\ufebe','\u0637','\ufec3','\ufec4','\ufec2', + '\u0638','\ufec7','\ufec8','\ufec6','\u0639','\ufecb','\ufecc','\ufeca', + '\u063a','\ufecf','\ufed0','\ufece','\u0641','\ufed3','\ufed4','\ufed2', + '\u0642','\ufed7','\ufed8','\ufed6','\u0643','\ufedb','\ufedc','\ufeda', + '\u0644','\ufedf','\ufee0','\ufede','\u0645','\ufee3','\ufee4','\ufee2', + '\u0646','\ufee7','\ufee8','\ufee6','\u0647','\ufeeb','\ufeec','\ufeea', + '\u0629','\ufe94','\u0648','\ufeee','\u064a','\ufef3','\ufef4','\ufef2', + '\u0649','\ufef3','\ufef4','\ufef0','\u0621','\u0623','\ufe84','\u0625', + '\ufe88','\u0624','\ufe86','\u0626','\ufe8b','\ufe8c','\ufe8a','\ufd3d', + '\ufd3c','\u0622','\ufe82','\u0671','\ufb51','\ufefb','\ufefc','\ufef7', + '\ufef8','\ufef9','\ufefa','\u0020','\ufefc','\ufef5','\ufef6','\u0020', + '\u0020','\ufdf2','\u0640','\u0640' + }, + // WP Charset 14: Arabic Script (220 chars) + { + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0615', + '\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0615','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020','\u0020', + '\u0020','\u06d4','\u0020','\u0020','\u00b0','\u0020','\u065a','\u0020', + '\u065a','\u0659','\u0020','\u0020','\u0654','\u064c','\ufc5e','\u065a', + '\u065a','\u06f4','\u06f4','\u06f5','\u06f6','\u06f6','\u06f7','\u06f8', + '\u067b','\ufb54','\ufb55','\ufb53','\u0680','\ufb5c','\ufb5d','\ufb5b', + '\u067e','\ufb58','\ufb59','\ufb57','\u0679','\ufb68','\ufb69','\ufb67', + '\u067c','\u067c','\u067c','\u067c','\u067f','\ufb64','\ufb65','\ufb63', + '\u067d','\u067d','\u067d','\u067d','\u067a','\ufb60','\ufb61','\ufb5f', + '\u0684','\ufb74','\ufb75','\ufb73','\u0683','\ufb78','\ufb79','\ufb77', + '\u0686','\ufb7c','\ufb7d','\ufb7b','\u0687','\ufb80','\ufb81','\ufb7f', + '\u0685','\u0685','\u0685','\u0685','\u0681','\u0681','\u0681','\u0681', + 
'\u0688','\ufb89','\u0689','\u0689','\u068c','\ufb85','\u068e','\ufb87', + '\u068a','\u068a','\u068d','\ufb83','\u0693','\u0693','\u0691','\ufb8d', + '\u0699','\u0699','\u0695','\u0695','\u0692','\u0692','\u0698','\ufb8b', + '\u0696','\u0696','\u075b','\u075b','\u069a','\u069a','\u069a','\u069a', + '\u06a0','\u06a0','\u06a0','\u06a0','\u06a4','\ufb6c','\ufb6d','\ufb6b', + '\u06a6','\ufb70','\ufb71','\ufb6f','\u06a9','\ufb90','\ufb91','\ufb8f', + '\u0643','\ufedb','\ufedc','\ufeda','\u06aa','\u06aa','\u06aa','\u06aa', + '\u06af','\ufb94','\ufb95','\ufb93','\u06af','\ufb94','\ufb95','\ufb93', + '\u06ab','\u06ab','\u06ab','\u06ab','\u06b1','\ufb9c','\ufb9d','\ufb9b', + '\u06b3','\ufb98','\ufb99','\ufb97','\u06b5','\u06b5','\u06b5','\u06b5', + '\u0020','\u0020','\u06ba','\u0020','\u0020','\ufb9f','\u06bc','\u06bc', + '\u06bc','\u06bc','\u06bb','\ufba2','\ufba3','\ufba1','\u06c6','\ufbda', + '\u0020','\u0020','\u06ca','\u06ca','\u06c1','\ufba8','\ufba9','\ufba7', + '\u06ce','\u06ce','\u06ce','\u06ce','\u06d2','\ufbaf','\u06d1','\u06d1', + '\u06d1','\u06d1','\u06c0','\ufba5' + }, + }; + + + /** + * Constructor. + */ + private WP6Constants() { + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java new file mode 100644 index 0000000..4b81256 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6FileHeader.java @@ -0,0 +1,192 @@ +/* Copyright 2015-2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +import org.apache.commons.lang.builder.ToStringBuilder; + +/** + * WordPerfect file header. + * @author Pascal Essiembre + */ +public class WP6FileHeader { + + // Normal header + private String fileId; + private long docAreaPointer; + private int productType; + private int fileType; + private int majorVersion; + private int minorVersion; + private boolean encrypted; + private int indexAreaPointer; + + // Extended header + private long fileSize; + + public WP6FileHeader() { + super(); + } + + public String getFileId() { + return fileId; + } + + public void setFileId(String fileId) { + this.fileId = fileId; + } + + public long getDocAreaPointer() { + return docAreaPointer; + } + + public void setDocAreaPointer(long docAreaPointer) { + this.docAreaPointer = docAreaPointer; + } + + public int getProductType() { + return productType; + } + + public void setProductType(int productType) { + this.productType = productType; + } + + public int getFileType() { + return fileType; + } + + public void setFileType(int fileType) { + this.fileType = fileType; + } + + public int getMajorVersion() { + return majorVersion; + } + + public void setMajorVersion(int majorVersion) { + this.majorVersion = majorVersion; + } + + public int getMinorVersion() { + return minorVersion; + } + + public void setMinorVersion(int minorVersion) { + this.minorVersion = minorVersion; + } + + public boolean isEncrypted() { + return encrypted; + } + + public void setEncrypted(boolean encrypted) { + this.encrypted = encrypted; + } + + 
public int getIndexAreaPointer() { + return indexAreaPointer; + } + + public void setIndexAreaPointer(int indexAreaPointer) { + this.indexAreaPointer = indexAreaPointer; + } + + public long getFileSize() { + return fileSize; + } + public void setFileSize(long fileSize) { + this.fileSize = fileSize; + } + + @Override + public String toString() { + ToStringBuilder builder = new ToStringBuilder(this); + builder.append("fileId", fileId); + builder.append("docAreaPointer", docAreaPointer); + builder.append("productType", productType); + builder.append("fileType", fileType); + builder.append("majorVersion", majorVersion); + builder.append("minorVersion", minorVersion); + builder.append("encrypted", encrypted); + builder.append("indexAreaPointer", indexAreaPointer); + builder.append("fileSize", fileSize); + return builder.toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + (int) (docAreaPointer ^ (docAreaPointer >>> 32)); + result = prime * result + (encrypted ? 1231 : 1237); + result = prime * result + ((fileId == null) ? 
0 : fileId.hashCode()); + result = prime * result + (int) (fileSize ^ (fileSize >>> 32)); + result = prime * result + fileType; + result = prime * result + indexAreaPointer; + result = prime * result + majorVersion; + result = prime * result + minorVersion; + result = prime * result + productType; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof WP6FileHeader)) { + return false; + } + WP6FileHeader other = (WP6FileHeader) obj; + if (docAreaPointer != other.docAreaPointer) { + return false; + } + if (encrypted != other.encrypted) { + return false; + } + if (fileId == null) { + if (other.fileId != null) { + return false; + } + } else if (!fileId.equals(other.fileId)) { + return false; + } + if (fileSize != other.fileSize) { + return false; + } + if (fileType != other.fileType) { + return false; + } + if (indexAreaPointer != other.indexAreaPointer) { + return false; + } + if (majorVersion != other.majorVersion) { + return false; + } + if (minorVersion != other.minorVersion) { + return false; + } + if (productType != other.productType) { + return false; + } + return true; + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java new file mode 100644 index 0000000..1a2198d --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WP6TextExtractor.java @@ -0,0 +1,218 @@ +/* Copyright 2015-2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Extracts text from a WordPerfect document according to WP6 File Format. + * This format appears to be compatible with more recent versions too. + * @author Pascal Essiembre + */ +public class WP6TextExtractor { + + public void extract( + InputStream input, XHTMLContentHandler xhtml, Metadata metadata) + throws IOException, SAXException { + WPInputStream in = new WPInputStream(input); + + WP6FileHeader header = parseFileHeader(in); + + applyMetadata(header, metadata); + + // For text extraction we can safely ignore WP Index Area and + // Packet Data Area and jump right away to Document Area. 
+ extractDocumentText(in, header.getDocAreaPointer(), xhtml); + + } + + private void applyMetadata(WP6FileHeader header, Metadata metadata) { + metadata.set(WordPerfect.FILE_SIZE, + Long.toString(header.getFileSize())); + metadata.set(WordPerfect.FILE_ID, header.getFileId()); + metadata.set(WordPerfect.PRODUCT_TYPE, header.getProductType()); + metadata.set(WordPerfect.FILE_TYPE, header.getFileType()); + metadata.set(WordPerfect.MAJOR_VERSION, header.getMajorVersion()); + metadata.set(WordPerfect.MINOR_VERSION, header.getMinorVersion()); + metadata.set(WordPerfect.ENCRYPTED, + Boolean.toString(header.isEncrypted())); + } + + private void extractDocumentText( + WPInputStream in, long offset, XHTMLContentHandler xhtml) + throws IOException, SAXException { + xhtml.startElement("p"); + + // Move to offset (for some reason skip() did not work). + for (int i = 0; i < offset; i++) { + in.readWPByte(); + } + + int chunk = 4096; + StringBuilder out = new StringBuilder(chunk); + + int c; + while ((c = in.read()) != -1) { + if (c > 0 && c <= 32) { + out.append(WP6Constants.DEFAULT_EXTENDED_INTL_CHARS[c]); + } else if (c >= 33 && c <= 126) { + out.append((char) c); + } else if (c == 128) { + out.append(' '); // Soft space + } else if (c == 129) { + out.append('\u00A0'); // Hard space + } else if (c == 129) { + out.append('-'); // Hard hyphen + } else if (c == 135 || c == 137) { + out.append('\n'); // Dormant Hard return + } else if (c == 138) { + // skip to closing pair surrounding page number + skipUntilChar(in, 139); + } else if (c == 198) { + // end of cell + out.append('\t'); + } else if (c >= 180 && c <= 207) { + out.append('\n'); + } else if (c >= 208 && c <= 239) { + // Variable-Length Multi-Byte Functions + int subgroup = in.read(); + int functionSize = in.readWPShort(); + for (int i = 0; i < functionSize - 4; i++) { + in.read(); + } + + // End-of-Line group + if (c == 208) { + if (subgroup >= 1 && subgroup <= 3) { + out.append(' '); + } else if (subgroup == 10) { + // 
end of cell + out.append('\t'); + } else if (subgroup >= 4 && subgroup <= 19) { + out.append('\n'); + } else if (subgroup >= 20 && subgroup <= 22) { + out.append(' '); + } else if (subgroup >= 23 && subgroup <= 28) { + out.append('\n'); + } + } else if (c == 213) { + out.append(' '); + } else if (c == 224) { + out.append('\t'); + } + //TODO Are there functions containing data? Like footnotes? + + } else if (c == 240) { + // extended char + int charval = in.read(); + int charset = in.read(); + in.read(); // closing character + + //TODO implement all charsets + if (charset == 4 || charset == 5) { + out.append( + WP6Constants.EXTENDED_CHARSETS[charset][charval]); + } else { + out.append("[TODO:charset" + charset + "]"); + } + } else if (c >= 241 && c <= 254) { + skipUntilChar(in, c); + } else if (c == 255) { + skipUntilChar(in, c); + } + + if (out.length() >= chunk) { + xhtml.characters(out.toString()); + out.setLength(0); + } + } + + // Ignored codes above 127: + + // 130,131,133: soft hyphens + // 134: invisible return in line + // 136: soft end of center/align + // 140: style separator mark + // 141,142: start/end of text to skip + // 143: exited hyphenation + // 144: cancel hyphenation + // 145-151: match functions + // 152-179: unknown/ignored + // 255: reserved, cannot be used + + xhtml.characters(out.toString()); + out.setLength(0); + xhtml.endElement("p"); + } + + // Skips until the given character is encountered. 
+ private int skipUntilChar(WPInputStream in, int targetChar) + throws IOException { + int count = 0; + int c; + while ((c = in.read()) != -1) { + count++; + if (c == targetChar) { + return count; + } + } + return count; + } + + private WP6FileHeader parseFileHeader(WPInputStream in) + throws IOException { + WP6FileHeader header = new WP6FileHeader(); + + // File header + in.mark(30); + header.setFileId(in.readWPString(4)); // 1-4 + header.setDocAreaPointer(in.readWPLong()); // 5-8 + header.setProductType(in.read()); // 9 + header.setFileType(in.readWPChar()); // 10 + header.setMajorVersion(in.read()); // 11 + header.setMinorVersion(in.read()); // 12 + header.setEncrypted(in.readWPShort() != 0); // 13-14 + header.setIndexAreaPointer(in.readWPShort()); // 15-16 + try { + in.skip(4); // 4 reserved bytes: skip // 17-20 + header.setFileSize(in.readWPLong()); // 21-24 + } catch (IOException e) { + // May fail if not extended error, which is fine. + } + in.reset(); + + //TODO header may be shared between corel products, so move validation + //specific to each product elsewhere? + //TODO convert to logs only, and let it fail elsewhere? +// if (!WP6Constants.WP6_FILE_ID.equals(header.getFileId())) { +// throw new IOException("Not a WordPerfect file. File must start " +// + "with " + WP6Constants.WP6_FILE_ID + " but was " +// + header.getFileId()); +// } +// if (WP6Constants.WP6_PRODUCT_TYPE != header.getProductType()) { +// throw new IOException("Not a WordPerfect file. Product type " +// + "must be " + WP6Constants.WP6_PRODUCT_TYPE + " but was " +// + header.getProductType()); +// } + //TODO perform file type validation? 
/**
 * {@link InputStream} wrapper adding WordPerfect-specific byte-reading methods.
 * All multi-byte values are read least-significant byte first.
 * @author Pascal Essiembre
 */
public class WPInputStream extends InputStream {

    private final DataInputStream in;

    /**
     * Constructor. The stream is wrapped in a {@link BufferedInputStream}
     * unless it already is one.
     * @param in input stream
     * @throws NullPointerException if {@code in} is {@code null}
     */
    public WPInputStream(InputStream in) {
        if (in == null) {
            throw new NullPointerException("in");
        }
        BufferedInputStream bis;
        if (in instanceof BufferedInputStream) {
            bis = (BufferedInputStream) in;
        } else {
            bis = new BufferedInputStream(in);
        }
        this.in = new DataInputStream(bis);
    }

    /**
     * Reads a WordPerfect "short": a 2 bytes (16-bit) unsigned value in
     * reverse order.
     * @return an integer value
     * @throws IOException if not enough bytes remain
     */
    public int readWPShort() throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        // read() returns -1 at end of stream, which makes the OR negative.
        if ((ch1 | ch2) < 0) {
            throw new EOFException();
        }
        return (ch2 << 8) | ch1;
    }

    /**
     * Reads a WordPerfect "long": a 4 bytes (32-bit) unsigned value in
     * reverse order.
     * @return a long value in the range 0 to 4294967295
     * @throws IOException if not enough bytes remain
     */
    public long readWPLong() throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        int ch3 = in.read();
        int ch4 = in.read();
        if ((ch1 | ch2 | ch3 | ch4) < 0) {
            throw new EOFException();
        }
        // Widen the top byte before shifting: in int arithmetic a top byte
        // of 0x80 or more would turn the "unsigned" result negative.
        return ((long) ch4 << 24) | (ch3 << 16) | (ch2 << 8) | ch1;
    }

    /**
     * Reads a WordPerfect byte (8-bit).
     * @return byte value
     * @throws IOException if not enough bytes remain
     */
    public byte readWPByte() throws IOException {
        return in.readByte();
    }

    /**
     * Skips the specified number of WordPerfect byte (8-bit).
     * @param numOfBytes number of bytes to skip
     * @throws IOException if not enough bytes remain
     */
    public void skipWPByte(int numOfBytes) throws IOException {
        for (int i = 0; i < numOfBytes; i++) {
            readWPByte();
        }
    }

    /**
     * Reads a WordPerfect character (8-bit).
     * @return character
     * @throws IOException if not enough bytes remain
     */
    public char readWPChar() throws IOException {
        int c = in.read();
        if (c == -1) {
            // Casting -1 to char would silently yield '\uffff'.
            throw new EOFException();
        }
        return (char) c;
    }

    /**
     * Reads a WordPerfect string of specified length (1 byte per character).
     * @param length how many characters to read
     * @return a string
     * @throws IOException if not enough bytes remain
     */
    public String readWPString(int length) throws IOException {
        char[] chars = new char[length];
        for (int i = 0; i < length; i++) {
            int c = in.read();
            if (c == -1) {
                throw new EOFException();
            }
            chars[i] = (char) c;
        }
        return new String(chars);
    }

    /**
     * Reads a series of bytes of the specified length, converting
     * each byte to its two-digit lowercase hexadecimal representation.
     * @param numOfBytes how many bytes to read
     * @return an hexadecimal string
     * @throws IOException if not enough bytes remain
     */
    public String readWPHexString(int numOfBytes) throws IOException {
        StringBuilder b = new StringBuilder();
        for (int i = 0; i < numOfBytes; i++) {
            b.append(readWPHex());
        }
        return b.toString();
    }

    /**
     * Reads the next byte and returns it as a two-digit lowercase
     * hexadecimal value.
     * @return hexadecimal string for a single byte
     * @throws IOException if not enough bytes remain
     */
    public String readWPHex() throws IOException {
        int b = read();
        if (b == -1) {
            // Without this check end of stream would be rendered as "-1".
            throw new EOFException();
        }
        String hex = Integer.toString(b, 16);
        return hex.length() < 2 ? "0" + hex : hex;
    }


    @Override
    public int read() throws IOException {
        return in.read();
    }

    @Override
    public int read(byte[] b) throws IOException {
        return in.read(b);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        return in.read(b, off, len);
    }

    @Override
    public long skip(long n) throws IOException {
        return in.skip(n);
    }

    @Override
    public int available() throws IOException {
        return in.available();
    }

    @Override
    public void close() throws IOException {
        in.close();
    }

    @Override
    public synchronized void mark(int readlimit) {
        in.mark(readlimit);
    }

    @Override
    public synchronized void reset() throws IOException {
        in.reset();
    }

    @Override
    public boolean markSupported() {
        return in.markSupported();
    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java new file mode 100644 index 0000000..aadbd35 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfect.java @@ -0,0 +1,69 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; + +/** + * WordPerfect properties collection. + * @author Pascal Essiembre + */ +public interface WordPerfect { + public static final String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect"; + + /** + * File size as defined in document header. + */ + Property FILE_SIZE = Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileSize"); + /** + * File identifier. + */ + Property FILE_ID = Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileId"); + /** + * Product type. 
+ */ + Property PRODUCT_TYPE = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "ProductType"); + /** + * File type. + */ + Property FILE_TYPE = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "FileType"); + /** + * Major version. + */ + Property MAJOR_VERSION = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "MajorVersion"); + /** + * Minor version. + */ + Property MINOR_VERSION = Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorVersion"); + /** + * Is encrypted?. + */ + Property ENCRYPTED = Property.internalBoolean( + WORDPERFECT_METADATA_NAME_PREFIX + + Metadata.NAMESPACE_PREFIX_DELIMITER + "Encrypted"); +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java new file mode 100644 index 0000000..105e803 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java @@ -0,0 +1,74 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * <p>Parser for Corel WordPerfect documents. Targets WP6 File Format + * but appears to be compatible with more recent versions too.</p> + * <p>{@link #getSupportedTypes(ParseContext)} ignores its argument and always + * returns the same unmodifiable set of six {@code application/*} media types.</p> + * <p>{@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} sets + * the content type to {@code application/wordperfect} only when the supplied + * metadata has no content type yet, wraps the handler in an + * {@link XHTMLContentHandler}, and delegates the extraction to a new + * {@code WP6TextExtractor} between {@code startDocument()} and + * {@code endDocument()}. The {@code context} argument is not read by + * {@code parse}.</p> + * <p>NOTE(review): the fallback value {@code application/wordperfect} differs + * from {@code application/vnd.wordperfect}, which is what WordPerfectTest + * asserts after going through {@code Tika.getParser()} - confirm which value + * is intended as the default.</p> + * @author Pascal Essiembre + */ +public class WordPerfectParser extends AbstractParser { + + private static final long serialVersionUID = 8941810225917012232L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.wordperfect"), + MediaType.application("wordperfect"), + MediaType.application("wordperfect5.1"), + MediaType.application("wordperfect6.0"), + MediaType.application("wordperfect6.1"), + MediaType.application("x-corel-wordperfect")))); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + if (metadata.get(Metadata.CONTENT_TYPE) == null) { + metadata.set(Metadata.CONTENT_TYPE, "application/wordperfect"); + } + + XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + WP6TextExtractor extractor = new WP6TextExtractor(); + extractor.extract(stream, xhtml, metadata); + +
xhtml.endDocument(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 4d6e6d4..602ee2c 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -57,6 +57,8 @@ org.apache.tika.parser.pkg.RarParser org.apache.tika.parser.rtf.RTFParser org.apache.tika.parser.txt.TXTParser org.apache.tika.parser.video.FLVParser +org.apache.tika.parser.wordperfect.QuattroProParser +org.apache.tika.parser.wordperfect.WordPerfectParser org.apache.tika.parser.xml.DcXMLParser org.apache.tika.parser.dif.DIFParser org.apache.tika.parser.xml.FictionBookParser http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java new file mode 100644 index 0000000..ea5d12f --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/QuattroProTest.java @@ -0,0 +1,58 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.wordperfect; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileInputStream; +import java.io.StringWriter; + +import org.apache.tika.Tika; +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.WriteOutContentHandler; +import org.junit.Test; + +/** + * JUnit test class for the {@link QuattroProParser}. + * <p>{@link #testQPW()} parses {@code /test-documents/testQUATTRO.qpw} with the + * parser returned by {@code tika.getParser()}, writing the text into a + * {@link StringWriter}. It then checks that the metadata holds exactly one + * content type, {@code application/x-quattro-pro}, and that the extracted + * text contains "This is an example spreadsheet".</p> + * <p>NOTE(review): the {@link FileInputStream} opened for the test document is + * never closed by the test itself.</p> + * @author Pascal Essiembre + */ +public class QuattroProTest extends TikaTest { + + private Tika tika = new Tika(); + + //TODO add testWB/testQUATTRO.wb3 if .wb?
files get supported + + @Test + public void testQPW() throws Exception { + File file = getResourceAsFile("/test-documents/testQUATTRO.qpw"); + + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + tika.getParser().parse( + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + String content = writer.toString(); + + assertEquals("application/x-quattro-pro", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("This is an example spreadsheet", content); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java new file mode 100644 index 0000000..c3af274 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/wordperfect/WordPerfectTest.java @@ -0,0 +1,56 @@ +/* Copyright 2016 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.wordperfect; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileInputStream; +import java.io.StringWriter; + +import org.apache.tika.Tika; +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.WriteOutContentHandler; +import org.junit.Test; + +/** + * JUnit test class for the {@link WordPerfectParser}. + * <p>{@link #testWordPerfectParser()} parses + * {@code /test-documents/testWordPerfect.wpd} with the parser returned by + * {@code tika.getParser()}, writing the text into a {@link StringWriter}. + * It then checks that the metadata holds exactly one content type, + * {@code application/vnd.wordperfect}, and that the extracted text contains + * "test test".</p> + * <p>NOTE(review): the {@link FileInputStream} opened for the test document is + * never closed by the test itself.</p> + * @author Pascal Essiembre + */ +public class WordPerfectTest extends TikaTest { + + private Tika tika = new Tika(); + + @Test + public void testWordPerfectParser() throws Exception { + File file = getResourceAsFile("/test-documents/testWordPerfect.wpd"); + + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + tika.getParser().parse( + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + String content = writer.toString(); + + assertEquals("application/vnd.wordperfect", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("test test", content); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/87c2ef31/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd b/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd new file mode 100644 index 0000000..4c6ae5d Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWordPerfect.wpd differ
