This is an automated email from the ASF dual-hosted git repository. schor pushed a commit to branch enhancement/UIMA-6128-xml-1-1-merge-v2-into-v3 in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git
commit caae4c58e123e9a3a2d1198ce0f1e80bc931a692 Merge: e1aa503 a62793d Author: Marshall Schor <m...@schor.com> AuthorDate: Fri Jan 17 10:13:05 2020 -0500 Merge branch 'master-v2' of https://github.com/apache/uima-uimaj into enhancement/UIMA-6128-xml-1-1-merge-v2-into-v3 Add XCAS_1_1 support, was left out of v2 by accident. .gitignore | 2 + .../java/org/apache/uima/cas/SerialFormat.java | 11 + .../org/apache/uima/cas/impl/XCASSerializer.java | 27 +- .../org/apache/uima/cas/impl/XmiCasSerializer.java | 59 +- .../main/java/org/apache/uima/util/CasIOUtils.java | 1090 ++++++++++---------- .../apache/uima/cas/impl/XmiCasSerializerTest.java | 8 +- 6 files changed, 651 insertions(+), 546 deletions(-) diff --cc uimaj-core/src/main/java/org/apache/uima/cas/impl/XCASSerializer.java index 1b9c7a4,e8acdd3..b9cde54 --- a/uimaj-core/src/main/java/org/apache/uima/cas/impl/XCASSerializer.java +++ b/uimaj-core/src/main/java/org/apache/uima/cas/impl/XCASSerializer.java @@@ -30,10 -24,11 +30,12 @@@ import java.util.IdentityHashMap import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Vector; +import java.util.Map.Entry; + import javax.xml.transform.OutputKeys; + import org.apache.uima.UimaContext; +import org.apache.uima.UimaSerializable; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Feature; import org.apache.uima.cas.TypeSystem; diff --cc uimaj-core/src/main/java/org/apache/uima/util/CasIOUtils.java index 041e354,9447830..8ba2802 --- a/uimaj-core/src/main/java/org/apache/uima/util/CasIOUtils.java +++ b/uimaj-core/src/main/java/org/apache/uima/util/CasIOUtils.java @@@ -1,542 -1,542 +1,548 @@@ - /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - package org.apache.uima.util; - - import java.io.BufferedInputStream; - import java.io.Closeable; - import java.io.DataInputStream; - import java.io.IOException; - import java.io.InputStream; - import java.io.ObjectInputStream; - import java.io.ObjectOutputStream; - import java.io.OutputStream; - import java.net.URL; - import java.nio.charset.StandardCharsets; - - import org.apache.uima.UIMARuntimeException; - import org.apache.uima.cas.CAS; - import org.apache.uima.cas.CASRuntimeException; - import org.apache.uima.cas.SerialFormat; - import org.apache.uima.cas.TypeSystem; - import org.apache.uima.cas.admin.CASMgr; - import org.apache.uima.cas.impl.AllowPreexistingFS; - import org.apache.uima.cas.impl.BinaryCasSerDes; - import org.apache.uima.cas.impl.BinaryCasSerDes4; - import org.apache.uima.cas.impl.CASCompleteSerializer; - import org.apache.uima.cas.impl.CASImpl; - import org.apache.uima.cas.impl.CASMgrSerializer; - import org.apache.uima.cas.impl.CASSerializer; - import org.apache.uima.cas.impl.CommonSerDes; - import org.apache.uima.cas.impl.CommonSerDes.Header; - import org.apache.uima.cas.impl.Serialization; - import org.apache.uima.cas.impl.TypeSystemImpl; - import org.apache.uima.cas.impl.XCASSerializer; - import org.apache.uima.cas.impl.XmiCasSerializer; - import org.xml.sax.SAXException; - - import static org.apache.uima.cas.impl.Serialization.serializeCAS; - import static org.apache.uima.cas.impl.Serialization.serializeWithCompression; - - /** - * <p>A collection of static methods aimed at making it easy to</p> - * <ul> - * <li>save and load CASes, and to</li> - * <li>optionally include the CAS's Type System (abbreviated TS (only available for Compressed Form 6)) and optionally also include the CAS's indexes definition.</li> - * <li>The combinaton of Type System and Indexes definition is called TSI. - * <ul> - * <li>The TSI's purpose: to replace the CAS's existing type system and index definition.</li> - * <li>The TS's purpose: to specify the type system used in the serialized data for format Compressed Form 6, in order to allow deserializing into some other type system in the CAS, leniently.</li> - * </ul> - * </li> - * </ul> - * - * <p>TSI information can be</p> - * <ul> - * <li>embedded</li> - * <li>externally supplied (via another input source to the load)</li> - * <li>both embedded and externally supplied. In this case the embedded takes precedence.</li> - * </ul> - * - * <p>TS information is available embedded, for COMPRESSED_FILTERED_TS format, - * and also from embedded or external TSI information (since it also contains the type system information).</p> - * - * <p>When an external TSI is supplied while loading Compressed Form 6,</p> - * <ul> - * <li>for COMPRESSED_FILTERED_TS - * <ul> - * <li>it uses the embedded TS for decoding</li> - * <li>it uses the external TSI to replace the CAS's existing type system and index definition if CasLoadMode == REINIT.</li> - * </ul> - * </li> - * <li>for COMPRESSED_FILTERED_TSI - * <ul> - * <li>the external TSI is ignored, the embedded one overrides, but otherwise operates as above.</li> - * </ul> - * </li> - * <li>for COMPRESSED_FILTERED - * <ul> - * <li>the external TSI's type system part is used for decoding.</li> - * <li>if CasLoadMode == REINIT, the external TSI is also used to replace the CAS's existing type system and index definition.</li> - * </ul> - * </li> - * </ul> - * - * <p>Compressed Form 6 loading decoding type system is picked from these sources, in this order:</p> - * <ul> - * <li>a passed in type system</li> - * <li>an embedded TS or TSI</li> - * <li>an external TSI</li> - * <li>the CAS's type system</li> - * </ul> - * - * <p>The serialization formats supported here are specified in the SerialFormat enum.</p> - * - * <p>The <code>load </code>api's automatically use the appropriate deserializers, based on the input data format.</p> - * - * <p>Loading inputs may be supplied as URLs or as an appropriately buffered InputStream.</p> - * - * <p>Note: you can use Files or Paths by converting these to URLs:</p> - * <ul> - * <li><code>URL url = a_path.toUri().toURL();</code></li> - * <li><code>URL url = a_file.toUri().toURL();</code></li> - * </ul> - * - * <p>When loading, an optional CasLoadMode enum value maybe specified to indicate</p> - * <ul> - * <li>LENIENT loading - used with XCas and XMI data data sources to silently ignore types and features present in the serialized form, but not in the receiving type system.</li> - * <li>REINIT - used with Compressed Form 6 loading to indicate that if no embedded TSI information is available, the external TSI is to be used to replace the CAS's existing type system and index definition.</li> - * </ul> - * - * <p style="padding-left: 30px;">For more details, see the Javadocs for CasLoadMode.</p> - * - * <p>When TS or TSI information is saved, it is either saved in the same destination (e.g. file or stream), or in a separate one.</p> - * <ul> - * <li>The serialization formats ending in _TSI and _TS support saving the TSI (or TS) in the same destination.</li> - * <li>The save APIs for other formats can optionally also save the TSI into a separate (second) OutputStream.</li> - * </ul> - * - * <p>Summary of APIs for saving:</p> - * <pre style="padding-left: 30px;"> - * <code>save(aCAS, outputStream, aSerialFormat)</code> - * <code>save(aCAS, outputStream, tsiOutputStream, aSerialFormat)</code></pre> - * - * <p>Summary of APIs for loading:</p> - * <pre style="padding-left: 30px;"> - * <code>load(aURL , aCas)</code> - * <code>load(inputStream, aCas)</code> - * <code>load(inputStream, aCas, typeSystem)</code> // typeSystem used for decoding Compressed Form 6 - * <code>load(inputStream, tsiInputStream, aCas)</code></pre> - * <pre style="padding-left: 30px;"> - * <code>load(aURL , tsiURL , aCAS, casLoadMode) - the second URL is for loading a separately-stored TSI</code> - * <code>load(inputStream, tsiInputStream, aCAS, aCasLoadMode)</code> - * <code>load(aURL , tsiURL , aCAS, lenient) - lenient is used to set the CasLoadMode to LENIENT or DEFAULT</code> - * <code>load(inputStream, tsiInputStream, aCAS, lenient)</code></pre> - */ - - public class CasIOUtils { - - /** - * Loads a Cas from a URL source. - * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, - * the CAS's type system and indexes definition are replaced. - * CasLoadMode is DEFAULT. - * - * @param casUrl - * The url containing the CAS - * @param aCAS - * The CAS that should be filled - * @return the SerialFormat of the loaded CAS - * @throws IOException - * - Problem loading from given URL - */ - public static SerialFormat load(URL casUrl, CAS aCAS) throws IOException { - - return load(casUrl, null, aCAS, CasLoadMode.DEFAULT); - } - - /** - * Loads a CAS from a URL source. The format is determined from the content. - * - * If the value of tsiUrl is null it is ignored. - * - * @param casUrl - * The url to deserialize the CAS from - * @param tsiUrl - * null or an optional url to deserialize the type system and index definitions from - * @param aCAS - * The CAS that should be filled - * @param casLoadMode specifies how to handle reinitialization and lenient loading - * see the Javadocs for CasLoadMode - * @return the SerialFormat of the loaded CAS - * @throws IOException Problem loading - */ - public static SerialFormat load(URL casUrl, URL tsiUrl, CAS aCAS, CasLoadMode casLoadMode) - throws IOException { - InputStream casIS = new BufferedInputStream(casUrl.openStream()); - InputStream tsIS = (tsiUrl == null) ? null : new BufferedInputStream(tsiUrl.openStream()); - try { - return load(casIS, tsIS, aCAS, casLoadMode); - } finally { - closeQuitely(casIS); - closeQuitely(tsIS); - } - } - - /** - * Loads a CAS from a URL source. The format is determined from the content. - * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, - * the CAS's type system and indexes definition are replaced. - * CasLoadMode is set according to the leniently flag. - * - * @param casUrl - * The url to deserialize the CAS from - * @param tsiUrl - * The optional url to deserialize the type system and index definitions from - * @param aCAS - * The CAS that should be filled - * @param leniently true means do lenient loading - * @return the SerialFormat of the loaded CAS - * @throws IOException Problem loading - */ - public static SerialFormat load(URL casUrl, URL tsiUrl, CAS aCAS, boolean leniently) - throws IOException { - return load(casUrl, tsiUrl, aCAS, leniently ? CasLoadMode.LENIENT : CasLoadMode.DEFAULT); - } - - /** - * Loads a Cas from an Input Stream. The format is determined from the content. - * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, - * the CAS's type system and indexes definition are replaced. - * CasLoadMode is DEFAULT. - * - * @param casInputStream - * The input stream containing the CAS. Caller should buffer this appropriately. - * @param aCAS - * The CAS that should be filled - * @return the SerialFormat of the loaded CAS - * @throws IOException - * - Problem loading from given InputStream - */ - public static SerialFormat load(InputStream casInputStream, CAS aCAS) throws IOException { - return load(casInputStream, null, aCAS, CasLoadMode.DEFAULT); - } - - /** - * Loads a CAS from an Input Stream. The format is determined from the content. - * - * For SerialFormats ending with _TSI the embedded value is used instead of any supplied external TSI information. - * TSI information is available either via embedded value, or if a non-null input is passed for tsiInputStream. - * - * If TSI information is available, the CAS's type system and indexes definition are replaced, - * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. - * - * The CasLoadMode is DEFAULT. - * - * @param casInputStream - - * @param tsiInputStream - - * @param aCAS - - * @return - - * @throws IOException - - */ - public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS) throws IOException { - return load(casInputStream, tsiInputStream, aCAS, CasLoadMode.DEFAULT); - } - - /** - * Loads a CAS from an Input Stream. The format is determined from the content. - * - * For SerialFormats ending with _TSI the embedded value is used instead of any supplied external TSI information. - * TSI information is available either via embedded value, or if a non-null input is passed for tsiInputStream. - * - * If TSI information is available, the CAS's type system and indexes definition are replaced, - * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. - * - * The CasLoadMode is set to LENIENT if the leniently flag is true; otherwise it is set to DEFAULT. - * - * @param casInputStream - - * @param tsiInputStream - - * @param aCAS - - * @param leniently - - * @return - - * @throws IOException - - */ - public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, boolean leniently) throws IOException { - return load(casInputStream, tsiInputStream, aCAS, leniently ? CasLoadMode.LENIENT : CasLoadMode.DEFAULT); - } - - /** - * Loads a CAS from an Input Stream. The format is determined from the content. - * For formats of ending in _TSI SERIALIZED_TSI or COMPRESSED_FILTERED_TSI, - * the type system and index definitions are read from the cas input source; - * the value of tsiInputStream is ignored. - * - * For other formats, if the tsiInputStream is not null, - * type system and index definitions are read from that source. - * - * If TSI information is available, the CAS's type system and indexes definition are replaced, - * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. - * - * If the CasLoadMode == REINIT, then the TSI information is also used for these 3 formats to replace the CAS's definitions. - * - * @param casInputStream - * The input stream containing the CAS, appropriately buffered. - * @param tsiInputStream - * The optional input stream containing the type system, appropriately buffered. - * This is only used if it is non null and - * - the casInputStream does not already come with an embedded CAS Type System and Index Definition, or - * - the serial format is COMPRESSED_FILTERED_TSI - * @param aCAS - * The CAS that should be filled - * @param casLoadMode specifies loading alternative like lenient and reinit, see CasLoadMode. - * @return the SerialFormat of the loaded CAS - * @throws IOException - * - Problem loading from given InputStream - */ - public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, - CasLoadMode casLoadMode) throws IOException { - return load(casInputStream, tsiInputStream, aCAS, casLoadMode, null); - } - - /** - * This load variant can be used for loading Form 6 compressed CASes where the - * type system to use to deserialize is provided as an argument. It can also load other formats, - * where its behavior is identical to load(casInputStream, aCas). - * - * Loads a CAS from an Input Stream. The format is determined from the content. - * For SerialFormats of ending in _TSI SERIALIZED_TSI or COMPRESSED_FILTERED_TSI, - * the type system and index definitions are read from the cas input source; - * the value of typeSystem is ignored. - * - * For COMPRESSED_FILTERED_xxx formats, if the typeSystem is not null, - * the typeSystem is used for decoding. - * - * If embedded TSI information is available, the CAS's type system and indexes definition are replaced, - * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. - * - * To replace the CAS's type system and indexes definition for these, use a load form which - * has the CasLoadMode argument, and set this to REINIT. - * - * @param casInputStream - * The input stream containing the CAS, appropriately buffered. - * @param aCAS - * The CAS that should be filled - * @param typeSystem the type system to use for decoding the serialized form, must be non-null - * @return the SerialFormat of the loaded CAS - * @throws IOException Problem loading from given InputStream - */ - public static SerialFormat load(InputStream casInputStream, CAS aCAS, TypeSystem typeSystem) throws IOException { - return load(casInputStream, null, aCAS, CasLoadMode.DEFAULT, (TypeSystemImpl) typeSystem); - } - - private static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, - CasLoadMode casLoadMode, TypeSystemImpl typeSystem) throws IOException { - - if (!casInputStream.markSupported()) { - casInputStream = new BufferedInputStream(casInputStream); - } - - CASImpl casImpl = (CASImpl) aCAS; - BinaryCasSerDes bcsd = casImpl.getBinaryCasSerDes(); - - // scan the first part of the file for known formats - casInputStream.mark(6); - byte[] firstPartOfFile = new byte[6]; - int bytesReadCount = casInputStream.read(firstPartOfFile); - casInputStream.reset(); - String start = new String(firstPartOfFile, 0, bytesReadCount, StandardCharsets.UTF_8).toLowerCase(); - - if (start.startsWith("<?xml ")) { // could be XCAS or XMI - try { - bcsd.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); - // next call decides on XMI or XCAS via content - return XmlCasDeserializer.deserializeR(casInputStream, aCAS, casLoadMode == CasLoadMode.LENIENT); - } catch (SAXException e) { - throw new UIMARuntimeException(e); - } - } - - // Not an XML file, decode as binary file - DataInputStream deserIn = CommonSerDes.maybeWrapToDataInputStream(casInputStream); - if (CommonSerDes.isBinaryHeader(deserIn)) { - - /******************************************* - * Binary, Compressed Binary (form 4 or 6) - ******************************************/ - Header h = CommonSerDes.readHeader(deserIn); - return bcsd.reinit(h, casInputStream, readCasManager(tsiInputStream), casLoadMode, null, AllowPreexistingFS.allow, typeSystem); - - } else { - - /****************************** - * Java Object loading - ******************************/ - ObjectInputStream ois = new ObjectInputStream(casInputStream); - try { - Object o = ois.readObject(); - if (o instanceof CASSerializer) { - bcsd.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); - bcsd.reinit((CASSerializer) o); // deserialize from object - return SerialFormat.SERIALIZED; - } else if (o instanceof CASCompleteSerializer) { - // with a type system use that, ignore any supplied via tsiInputStream - bcsd.reinit((CASCompleteSerializer) o); - return SerialFormat.SERIALIZED_TSI; - } else { - /**Unrecognized serialized CAS format*/ - throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT); - } - } catch (ClassNotFoundException e) { - /**Unrecognized serialized CAS format*/ - throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT); - } - } - } - - /** - * Write the CAS in the specified format. - * - * @param aCas - * The CAS that should be serialized and stored - * @param docOS - * The output stream for the CAS - * @param format - * The SerialFormat in which the CAS should be stored. - * @throws IOException - * - Problem saving to the given InputStream - */ - public static void save(CAS aCas, OutputStream docOS, SerialFormat format) throws IOException { - save(aCas, docOS, null, format); - } - - /** - * Write the CAS in the specified format. If the format does not include typesystem information - * and the optional output stream of the typesystem is specified, then the typesystem information - * is written there. - * - * @param aCas - * The CAS that should be serialized and stored - * @param docOS - * The output stream for the CAS, with appropriate buffering - * @param tsiOS - * Optional output stream for type system information. Only used if the format does not - * support storing typesystem information directly in the main output file. - * @param format - * The SerialFormat in which the CAS should be stored. - * @throws IOException - * - Problem saving to the given InputStream - */ - public static void save(CAS aCas, OutputStream docOS, OutputStream tsiOS, SerialFormat format) - throws IOException { - boolean typeSystemWritten = false; - try { - switch (format) { - case XMI: - XmiCasSerializer.serialize(aCas, docOS); - break; - case XCAS: - XCASSerializer.serialize(aCas, docOS, true); // true = formatted output - break; - case SERIALIZED: - writeJavaObject(Serialization.serializeCAS(aCas), docOS); - break; - case SERIALIZED_TSI: - writeJavaObject(Serialization.serializeCASComplete((CASMgr) aCas), docOS); - typeSystemWritten = true; // Embedded type system - break; - case BINARY: // Java-serialized CAS without type system - serializeCAS(aCas, docOS); - break; - case BINARY_TSI: // Java-serialized CAS without type system - CASSerializer ser = new CASSerializer(); - ser.addCAS((CASImpl) aCas, docOS, true); - break; - case COMPRESSED: // Binary compressed CAS without type system (form 4) - serializeWithCompression(aCas, docOS); - break; - case COMPRESSED_TSI: // Binary compressed CAS without type system (form 4) - new BinaryCasSerDes4((TypeSystemImpl)aCas.getTypeSystem(), false).serializeWithTsi((CASImpl) aCas, docOS); - break; - case COMPRESSED_FILTERED: // Binary compressed CAS (form 6) - serializeWithCompression(aCas, docOS, false, false); - break; - case COMPRESSED_FILTERED_TS: - serializeWithCompression(aCas, docOS, true, false); - typeSystemWritten = true; // Embedded type system - break; - case COMPRESSED_FILTERED_TSI: - serializeWithCompression(aCas, docOS, false, true); - typeSystemWritten = true; // Embedded type system - break; - default: - StringBuilder sb = new StringBuilder(); - for (SerialFormat sf : SerialFormat.values()) { - sb = sb.append(sf.toString()).append(", "); - } - throw new IllegalArgumentException("Unknown format [" + format.name() - + "]. Must be one of: " + sb.toString()); - } - } catch (IOException e) { - throw e; - } catch (Exception e) { - throw new IOException(e); - } - - // Write type system to the separate stream only if it has not already been embedded into the - // main stream - if (tsiOS != null && !typeSystemWritten) { - writeTypeSystem(aCas, tsiOS, true); - } - } - - private static CASMgrSerializer readCasManager(InputStream tsiInputStream) throws IOException { - try { - if (null == tsiInputStream) { - return null; - } - ObjectInputStream is = new ObjectInputStream(tsiInputStream); - return (CASMgrSerializer) is.readObject(); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } - - private static void writeJavaObject(Object o, OutputStream aOS) throws IOException { - ObjectOutputStream tsiOS = new ObjectOutputStream(aOS); - tsiOS.writeObject(o); - tsiOS.flush(); - } - - public static void writeTypeSystem(CAS aCas, OutputStream aOS, boolean includeIndexDefs) throws IOException { - writeJavaObject(includeIndexDefs - ? Serialization.serializeCASMgr((CASImpl) aCas) - : Serialization.serializeCASMgrTypeSystemOnly((CASImpl) aCas) - , aOS); - } - - private static void closeQuitely(Closeable closeable) { - if (closeable != null) { - try { - closeable.close(); - } catch (IOException e) { - // do nothing - } - } - } - - } + /* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.uima.util; + -import static org.apache.uima.cas.impl.Serialization.serializeCAS; -import static org.apache.uima.cas.impl.Serialization.serializeWithCompression; - + import java.io.BufferedInputStream; + import java.io.Closeable; + import java.io.DataInputStream; + import java.io.IOException; + import java.io.InputStream; + import java.io.ObjectInputStream; + import java.io.ObjectOutputStream; + import java.io.OutputStream; + import java.net.URL; ++import java.nio.charset.StandardCharsets; + + import org.apache.uima.UIMARuntimeException; + import org.apache.uima.cas.CAS; + import org.apache.uima.cas.CASRuntimeException; + import org.apache.uima.cas.SerialFormat; + import org.apache.uima.cas.TypeSystem; + import org.apache.uima.cas.admin.CASMgr; + import org.apache.uima.cas.impl.AllowPreexistingFS; ++import org.apache.uima.cas.impl.BinaryCasSerDes; + import org.apache.uima.cas.impl.BinaryCasSerDes4; + import org.apache.uima.cas.impl.CASCompleteSerializer; + import org.apache.uima.cas.impl.CASImpl; + import org.apache.uima.cas.impl.CASMgrSerializer; + import org.apache.uima.cas.impl.CASSerializer; + import org.apache.uima.cas.impl.CommonSerDes; + import org.apache.uima.cas.impl.CommonSerDes.Header; + import org.apache.uima.cas.impl.Serialization; + import org.apache.uima.cas.impl.TypeSystemImpl; + import org.apache.uima.cas.impl.XCASSerializer; + import org.apache.uima.cas.impl.XmiCasSerializer; + import org.xml.sax.SAXException; + ++import static org.apache.uima.cas.impl.Serialization.serializeCAS; ++import static org.apache.uima.cas.impl.Serialization.serializeWithCompression; ++ + /** + * <p>A collection of static methods aimed at making it easy to</p> + * <ul> + * <li>save and load CASes, and to</li> + * <li>optionally include the CAS's Type System (abbreviated TS (only available for Compressed Form 6)) and optionally also include the CAS's indexes definition.</li> - * <li>The combinatison of Type System and Indexes definition is called TSI. ++ * <li>The combination of Type System and Indexes definition is called TSI. + * <ul> + * <li>The TSI's purpose: to replace the CAS's existing type system and index definition.</li> + * <li>The TS's purpose: to specify the type system used in the serialized data for format Compressed Form 6, in order to allow deserializing into some other type system in the CAS, leniently.</li> + * </ul> + * </li> + * </ul> + * + * <p>TSI information can be</p> + * <ul> + * <li>embedded</li> + * <li>externally supplied (via another input source to the load)</li> + * <li>both embedded and externally supplied. In this case the embedded takes precedence.</li> + * </ul> + * + * <p>TS information is available embedded, for COMPRESSED_FILTERED_TS format, + * and also from embedded or external TSI information (since it also contains the type system information).</p> + * + * <p>When an external TSI is supplied while loading Compressed Form 6,</p> + * <ul> + * <li>for COMPRESSED_FILTERED_TS + * <ul> + * <li>it uses the embedded TS for decoding</li> + * <li>it uses the external TSI to replace the CAS's existing type system and index definition if CasLoadMode == REINIT.</li> + * </ul> + * </li> + * <li>for COMPRESSED_FILTERED_TSI + * <ul> + * <li>the external TSI is ignored, the embedded one overrides, but otherwise operates as above.</li> + * </ul> + * </li> + * <li>for COMPRESSED_FILTERED + * <ul> + * <li>the external TSI's type system part is used for decoding.</li> + * <li>if CasLoadMode == REINIT, the external TSI is also used to replace the CAS's existing type system and index definition.</li> + * </ul> + * </li> + * </ul> + * + * <p>Compressed Form 6 loading decoding type system is picked from these sources, in this order:</p> + * <ul> + * <li>a passed in type system</li> + * <li>an embedded TS or TSI</li> + * <li>an external TSI</li> + * <li>the CAS's type system</li> + * </ul> + * + * <p>The serialization formats supported here are specified in the SerialFormat enum.</p> + * + * <p>The <code>load </code>api's automatically use the appropriate deserializers, based on the input data format.</p> + * + * <p>Loading inputs may be supplied as URLs or as an appropriately buffered InputStream.</p> + * + * <p>Note: you can use Files or Paths by converting these to URLs:</p> + * <ul> + * <li><code>URL url = a_path.toUri().toURL();</code></li> + * <li><code>URL url = a_file.toUri().toURL();</code></li> + * </ul> + * + * <p>When loading, an optional CasLoadMode enum value maybe specified to indicate</p> + * <ul> + * <li>LENIENT loading - used with XCas and XMI data data sources to silently ignore types and features present in the serialized form, but not in the receiving type system.</li> + * <li>REINIT - used with Compressed Form 6 loading to indicate that if no embedded TSI information is available, the external TSI is to be used to replace the CAS's existing type system and index definition.</li> + * </ul> + * + * <p style="padding-left: 30px;">For more details, see the Javadocs for CasLoadMode.</p> + * + * <p>When TS or TSI information is saved, it is either saved in the same destination (e.g. file or stream), or in a separate one.</p> + * <ul> + * <li>The serialization formats ending in _TSI and _TS support saving the TSI (or TS) in the same destination.</li> + * <li>The save APIs for other formats can optionally also save the TSI into a separate (second) OutputStream.</li> + * </ul> + * + * <p>Summary of APIs for saving:</p> + * <pre style="padding-left: 30px;"> + * <code>save(aCAS, outputStream, aSerialFormat)</code> + * <code>save(aCAS, outputStream, tsiOutputStream, aSerialFormat)</code></pre> + * + * <p>Summary of APIs for loading:</p> + * <pre style="padding-left: 30px;"> + * <code>load(aURL , aCas)</code> + * <code>load(inputStream, aCas)</code> + * <code>load(inputStream, aCas, typeSystem)</code> // typeSystem used for decoding Compressed Form 6 + * <code>load(inputStream, tsiInputStream, aCas)</code></pre> + * <pre style="padding-left: 30px;"> + * <code>load(aURL , tsiURL , aCAS, casLoadMode) - the second URL is for loading a separately-stored TSI</code> + * <code>load(inputStream, tsiInputStream, aCAS, aCasLoadMode)</code> + * <code>load(aURL , tsiURL , aCAS, lenient) - lenient is used to set the CasLoadMode to LENIENT or DEFAULT</code> + * <code>load(inputStream, tsiInputStream, aCAS, lenient)</code></pre> + */ + + public class CasIOUtils { + + /** + * Loads a Cas from a URL source. + * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, + * the CAS's type system and indexes definition are replaced. + * CasLoadMode is DEFAULT. + * + * @param casUrl + * The url containing the CAS + * @param aCAS + * The CAS that should be filled + * @return the SerialFormat of the loaded CAS + * @throws IOException + * - Problem loading from given URL + */ + public static SerialFormat load(URL casUrl, CAS aCAS) throws IOException { + + return load(casUrl, null, aCAS, CasLoadMode.DEFAULT); + } - ++ + /** + * Loads a CAS from a URL source. The format is determined from the content. + * + * If the value of tsiUrl is null it is ignored. + * + * @param casUrl + * The url to deserialize the CAS from + * @param tsiUrl + * null or an optional url to deserialize the type system and index definitions from + * @param aCAS + * The CAS that should be filled + * @param casLoadMode specifies how to handle reinitialization and lenient loading + * see the Javadocs for CasLoadMode + * @return the SerialFormat of the loaded CAS + * @throws IOException Problem loading + */ + public static SerialFormat load(URL casUrl, URL tsiUrl, CAS aCAS, CasLoadMode casLoadMode) + throws IOException { + InputStream casIS = new BufferedInputStream(casUrl.openStream()); + InputStream tsIS = (tsiUrl == null) ? null : new BufferedInputStream(tsiUrl.openStream()); + try { + return load(casIS, tsIS, aCAS, casLoadMode); + } finally { + closeQuitely(casIS); + closeQuitely(tsIS); + } + } + + /** + * Loads a CAS from a URL source. The format is determined from the content. + * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, + * the CAS's type system and indexes definition are replaced. + * CasLoadMode is set according to the leniently flag. + * + * @param casUrl + * The url to deserialize the CAS from + * @param tsiUrl + * The optional url to deserialize the type system and index definitions from + * @param aCAS + * The CAS that should be filled + * @param leniently true means do lenient loading + * @return the SerialFormat of the loaded CAS + * @throws IOException Problem loading + */ + public static SerialFormat load(URL casUrl, URL tsiUrl, CAS aCAS, boolean leniently) + throws IOException { + return load(casUrl, tsiUrl, aCAS, leniently ? CasLoadMode.LENIENT : CasLoadMode.DEFAULT); + } + + /** + * Loads a Cas from an Input Stream. The format is determined from the content. + * For SerialFormats ending with _TSI except for COMPRESSED_FILTERED_TSI, + * the CAS's type system and indexes definition are replaced. + * CasLoadMode is DEFAULT. + * + * @param casInputStream + * The input stream containing the CAS. Caller should buffer this appropriately. + * @param aCAS + * The CAS that should be filled + * @return the SerialFormat of the loaded CAS + * @throws IOException + * - Problem loading from given InputStream + */ + public static SerialFormat load(InputStream casInputStream, CAS aCAS) throws IOException { + return load(casInputStream, null, aCAS, CasLoadMode.DEFAULT); + } + + /** + * Loads a CAS from an Input Stream. The format is determined from the content. + * + * For SerialFormats ending with _TSI the embedded value is used instead of any supplied external TSI information. + * TSI information is available either via embedded value, or if a non-null input is passed for tsiInputStream. + * + * If TSI information is available, the CAS's type system and indexes definition are replaced, + * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. + * + * The CasLoadMode is DEFAULT. + * + * @param casInputStream - + * @param tsiInputStream - + * @param aCAS - + * @return - + * @throws IOException - + */ + public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS) throws IOException { + return load(casInputStream, tsiInputStream, aCAS, CasLoadMode.DEFAULT); + } + + /** + * Loads a CAS from an Input Stream. The format is determined from the content. + * + * For SerialFormats ending with _TSI the embedded value is used instead of any supplied external TSI information. + * TSI information is available either via embedded value, or if a non-null input is passed for tsiInputStream. + * + * If TSI information is available, the CAS's type system and indexes definition are replaced, + * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. + * + * The CasLoadMode is set to LENIENT if the leniently flag is true; otherwise it is set to DEFAULT. + * + * @param casInputStream - + * @param tsiInputStream - + * @param aCAS - + * @param leniently - + * @return - + * @throws IOException - + */ + public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, boolean leniently) throws IOException { + return load(casInputStream, tsiInputStream, aCAS, leniently ? CasLoadMode.LENIENT : CasLoadMode.DEFAULT); + } + + /** + * Loads a CAS from an Input Stream. The format is determined from the content. + * For formats of ending in _TSI SERIALIZED_TSI or COMPRESSED_FILTERED_TSI, + * the type system and index definitions are read from the cas input source; + * the value of tsiInputStream is ignored. + * + * For other formats, if the tsiInputStream is not null, + * type system and index definitions are read from that source. + * + * If TSI information is available, the CAS's type system and indexes definition are replaced, + * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. + * + * If the CasLoadMode == REINIT, then the TSI information is also used for these 3 formats to replace the CAS's definitions. + * + * @param casInputStream + * The input stream containing the CAS, appropriately buffered. + * @param tsiInputStream + * The optional input stream containing the type system, appropriately buffered. + * This is only used if it is non null and + * - the casInputStream does not already come with an embedded CAS Type System and Index Definition, or + * - the serial format is COMPRESSED_FILTERED_TSI + * @param aCAS + * The CAS that should be filled + * @param casLoadMode specifies loading alternative like lenient and reinit, see CasLoadMode. + * @return the SerialFormat of the loaded CAS + * @throws IOException + * - Problem loading from given InputStream + */ + public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, + CasLoadMode casLoadMode) throws IOException { + return load(casInputStream, tsiInputStream, aCAS, casLoadMode, null); + } - ++ + /** + * This load variant can be used for loading Form 6 compressed CASes where the + * type system to use to deserialize is provided as an argument. It can also load other formats, + * where its behavior is identical to load(casInputStream, aCas). + * + * Loads a CAS from an Input Stream. The format is determined from the content. + * For SerialFormats of ending in _TSI SERIALIZED_TSI or COMPRESSED_FILTERED_TSI, + * the type system and index definitions are read from the cas input source; + * the value of typeSystem is ignored. + * + * For COMPRESSED_FILTERED_xxx formats, if the typeSystem is not null, + * the typeSystem is used for decoding. + * + * If embedded TSI information is available, the CAS's type system and indexes definition are replaced, + * except for SerialFormats COMPRESSED_FILTERED, COMPRESSED_FILTERED_TS, and COMPRESSED_FILTERED_TSI. + * + * To replace the CAS's type system and indexes definition for these, use a load form which + * has the CasLoadMode argument, and set this to REINIT. + * + * @param casInputStream + * The input stream containing the CAS, appropriately buffered. + * @param aCAS + * The CAS that should be filled + * @param typeSystem the type system to use for decoding the serialized form, must be non-null + * @return the SerialFormat of the loaded CAS + * @throws IOException Problem loading from given InputStream + */ + public static SerialFormat load(InputStream casInputStream, CAS aCAS, TypeSystem typeSystem) throws IOException { + return load(casInputStream, null, aCAS, CasLoadMode.DEFAULT, (TypeSystemImpl) typeSystem); + } + + private static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS, + CasLoadMode casLoadMode, TypeSystemImpl typeSystem) throws IOException { - ++ + if (!casInputStream.markSupported()) { + casInputStream = new BufferedInputStream(casInputStream); + } + + CASImpl casImpl = (CASImpl) aCAS; ++ BinaryCasSerDes bcsd = casImpl.getBinaryCasSerDes(); + + // scan the first part of the file for known formats + casInputStream.mark(6); + byte[] firstPartOfFile = new byte[6]; + int bytesReadCount = casInputStream.read(firstPartOfFile); + casInputStream.reset(); - String start = new String(firstPartOfFile, 0, bytesReadCount, "UTF-8").toLowerCase(); ++ String start = new String(firstPartOfFile, 0, bytesReadCount, StandardCharsets.UTF_8).toLowerCase(); + + if (start.startsWith("<?xml ")) { // could be XCAS or XMI + try { - casImpl.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); ++ bcsd.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); + // next call decides on XMI or XCAS via content + return XmlCasDeserializer.deserializeR(casInputStream, aCAS, casLoadMode == CasLoadMode.LENIENT); + } catch (SAXException e) { + throw new UIMARuntimeException(e); + } + } + + // Not an XML file, decode as binary file + DataInputStream deserIn = CommonSerDes.maybeWrapToDataInputStream(casInputStream); + if (CommonSerDes.isBinaryHeader(deserIn)) { + + /******************************************* + * Binary, Compressed Binary (form 4 or 6) + ******************************************/ + Header h = CommonSerDes.readHeader(deserIn); - return casImpl.reinit(h, casInputStream, readCasManager(tsiInputStream), casLoadMode, null, AllowPreexistingFS.allow, typeSystem); ++ return bcsd.reinit(h, casInputStream, readCasManager(tsiInputStream), casLoadMode, null, AllowPreexistingFS.allow, typeSystem); + + } else { + + /****************************** + * Java Object loading + ******************************/ + ObjectInputStream ois = new ObjectInputStream(casInputStream); + try { + Object o = ois.readObject(); + if (o instanceof CASSerializer) { - casImpl.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); - casImpl.reinit((CASSerializer) o); // deserialize from object ++ bcsd.setupCasFromCasMgrSerializer(readCasManager(tsiInputStream)); ++ bcsd.reinit((CASSerializer) o); // deserialize from object + return SerialFormat.SERIALIZED; + } else if (o instanceof CASCompleteSerializer) { + // with a type system use that, ignore any supplied via tsiInputStream - casImpl.reinit((CASCompleteSerializer) o); ++ bcsd.reinit((CASCompleteSerializer) o); + return SerialFormat.SERIALIZED_TSI; + } else { + /**Unrecognized serialized CAS format*/ + throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT); + } + } catch (ClassNotFoundException e) { + /**Unrecognized serialized CAS format*/ + throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT); + } + } + } + + /** + * Write the CAS in the specified format. + * + * @param aCas + * The CAS that should be serialized and stored + * @param docOS + * The output stream for the CAS + * @param format + * The SerialFormat in which the CAS should be stored. + * @throws IOException + * - Problem saving to the given InputStream + */ + public static void save(CAS aCas, OutputStream docOS, SerialFormat format) throws IOException { + save(aCas, docOS, null, format); + } + + /** + * Write the CAS in the specified format. If the format does not include typesystem information + * and the optional output stream of the typesystem is specified, then the typesystem information + * is written there. + * + * @param aCas + * The CAS that should be serialized and stored + * @param docOS + * The output stream for the CAS, with appropriate buffering + * @param tsiOS + * Optional output stream for type system information. Only used if the format does not + * support storing typesystem information directly in the main output file. + * @param format + * The SerialFormat in which the CAS should be stored. + * @throws IOException + * - Problem saving to the given InputStream + */ + public static void save(CAS aCas, OutputStream docOS, OutputStream tsiOS, SerialFormat format) + throws IOException { + boolean typeSystemWritten = false; + try { + switch (format) { + case XMI: + XmiCasSerializer.serialize(aCas, docOS); + break; + case XMI_1_1: + XmiCasSerializer.serialize(aCas, null, docOS, false, null, null, true); + break; + case XCAS: + XCASSerializer.serialize(aCas, docOS, true); // true = formatted output + break; ++ case XCAS_1_1: ++ XCASSerializer.serialize(aCas, docOS, true, true); // true = formatted output, use xml 1.1 ++ break; + case SERIALIZED: + writeJavaObject(Serialization.serializeCAS(aCas), docOS); + break; + case SERIALIZED_TSI: + writeJavaObject(Serialization.serializeCASComplete((CASMgr) aCas), docOS); + typeSystemWritten = true; // Embedded type system + break; + case BINARY: // Java-serialized CAS without type system + serializeCAS(aCas, docOS); + break; + case BINARY_TSI: // Java-serialized CAS without type system + CASSerializer ser = new CASSerializer(); + ser.addCAS((CASImpl) aCas, docOS, true); + break; + case COMPRESSED: // Binary compressed CAS without type system (form 4) + serializeWithCompression(aCas, docOS); + break; + case COMPRESSED_TSI: // Binary compressed CAS without type system (form 4) + new BinaryCasSerDes4((TypeSystemImpl)aCas.getTypeSystem(), false).serializeWithTsi((CASImpl) aCas, docOS); + break; + case COMPRESSED_FILTERED: // Binary compressed CAS (form 6) + serializeWithCompression(aCas, docOS, false, false); + break; + case COMPRESSED_FILTERED_TS: + serializeWithCompression(aCas, docOS, true, false); + typeSystemWritten = true; // Embedded type system + break; + case COMPRESSED_FILTERED_TSI: + serializeWithCompression(aCas, docOS, false, true); + typeSystemWritten = true; // Embedded type system + break; + default: + StringBuilder sb = new StringBuilder(); + for (SerialFormat sf : SerialFormat.values()) { + sb = sb.append(sf.toString()).append(", "); + } + throw new IllegalArgumentException("Unknown format [" + format.name() + + "]. Must be one of: " + sb.toString()); + } + } catch (IOException e) { + throw e; + } catch (Exception e) { + throw new IOException(e); + } + + // Write type system to the separate stream only if it has not already been embedded into the + // main stream + if (tsiOS != null && !typeSystemWritten) { + writeTypeSystem(aCas, tsiOS, true); + } + } + + private static CASMgrSerializer readCasManager(InputStream tsiInputStream) throws IOException { + try { + if (null == tsiInputStream) { + return null; + } + ObjectInputStream is = new ObjectInputStream(tsiInputStream); + return (CASMgrSerializer) is.readObject(); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + private static void writeJavaObject(Object o, OutputStream aOS) throws IOException { + ObjectOutputStream tsiOS = new ObjectOutputStream(aOS); + tsiOS.writeObject(o); + tsiOS.flush(); + } + + public static void writeTypeSystem(CAS aCas, OutputStream aOS, boolean includeIndexDefs) throws IOException { + writeJavaObject(includeIndexDefs + ? Serialization.serializeCASMgr((CASImpl) aCas) + : Serialization.serializeCASMgrTypeSystemOnly((CASImpl) aCas) + , aOS); + } + + private static void closeQuitely(Closeable closeable) { + if (closeable != null) { + try { + closeable.close(); + } catch (IOException e) { + // do nothing + } + } + } + -} ++} diff --cc uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java index 3209f62,fa4d491..0118ff0 --- a/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java +++ b/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java @@@ -27,8 -27,11 +27,9 @@@ import javax.xml.parsers.ParserConfigur import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.OutputKeys; -import junit.framework.TestCase; - import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; + import org.apache.uima.cas.SerialFormat; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.test.junit_extension.JUnitExtension; import org.apache.uima.util.CasCreationUtils;