Modified: tika/trunk/tika-batch/src/test/resources/log4j.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j.properties?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/log4j.properties (original) +++ tika/trunk/tika-batch/src/test/resources/log4j.properties Wed May 13 13:49:36 2015 @@ -1,22 +1,22 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -log4j.rootLogger=OFF - -#for debugging -#log4j.rootLogger=TRACE,A1 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +log4j.rootLogger=OFF + +#for debugging +#log4j.rootLogger=TRACE,A1 log4j.appender.A1=org.apache.log4j.ConsoleAppender
Modified: tika/trunk/tika-batch/src/test/resources/log4j_process.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j_process.properties?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/log4j_process.properties (original) +++ tika/trunk/tika-batch/src/test/resources/log4j_process.properties Wed May 13 13:49:36 2015 @@ -1,24 +1,24 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#This is used by the batch process; see log4j.properties for the driver - -log4j.rootLogger=OFF - -#for debugging -#log4j.rootLogger=TRACE,A1 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#This is used by the batch process; see log4j.properties for the driver + +log4j.rootLogger=OFF + +#for debugging +#log4j.rootLogger=TRACE,A1 log4j.appender.A1=org.apache.log4j.ConsoleAppender Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml Wed May 13 13:49:36 2015 @@ -103,10 +103,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Wed May 13 13:49:36 2015 @@ -96,10 +96,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Wed May 13 13:49:36 2015 @@ -102,10 +102,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java Wed May 13 13:49:36 2015 @@ -1,40 +1,40 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.exception; - -/** - * Exception to be thrown when a document does not allow content extraction. 
- * As of this writing, PDF documents are the only type of document that might - * cause this type of exception. - */ -public class AccessPermissionException extends TikaException { - public AccessPermissionException() { - super("Unable to process: content extraction is not allowed"); - } - - public AccessPermissionException(Throwable th) { - super("Unable to process: content extraction is not allowed", th); - } - - public AccessPermissionException(String info) { - super(info); - } - - public AccessPermissionException(String info, Throwable th) { - super(info, th); - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.exception; + +/** + * Exception to be thrown when a document does not allow content extraction. + * As of this writing, PDF documents are the only type of document that might + * cause this type of exception. 
+ */ +public class AccessPermissionException extends TikaException { + public AccessPermissionException() { + super("Unable to process: content extraction is not allowed"); + } + + public AccessPermissionException(Throwable th) { + super("Unable to process: content extraction is not allowed", th); + } + + public AccessPermissionException(String info) { + super(info); + } + + public AccessPermissionException(String info, Throwable th) { + super(info, th); + } +} Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java Wed May 13 13:49:36 2015 @@ -1,71 +1,71 @@ -package org.apache.tika.metadata; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Until we can find a common standard, we'll use these options. 
They - * were mostly derived from PDFBox's AccessPermission, but some can - * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM. - */ -public interface AccessPermissions { - - final static String PREFIX = "access_permission"+Metadata.NAMESPACE_PREFIX_DELIMITER; - - /** - * Can any modifications be made to the document - */ - Property CAN_MODIFY = Property.externalTextBag(PREFIX+"can_modify"); - - /** - * Should content be extracted, generally. - */ - Property EXTRACT_CONTENT = Property.externalText(PREFIX+"extract_content"); - - /** - * Should content be extracted for the purposes - * of accessibility. - */ - Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility"); - - /** - * Can the user insert/rotate/delete pages. - */ - Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX+"assemble_document"); - - - /** - * Can the user fill in a form - */ - Property FILL_IN_FORM = Property.externalText(PREFIX+"fill_in_form"); - - /** - * Can the user modify annotations - */ - Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX+"modify_annotations"); - - /** - * Can the user print the document - */ - Property CAN_PRINT = Property.externalText(PREFIX+"can_print"); - - /** - * Can the user print an image-degraded version of the document. - */ - Property CAN_PRINT_DEGRADED = Property.externalText(PREFIX+"can_print_degraded"); - -} +package org.apache.tika.metadata; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Until we can find a common standard, we'll use these options. They + * were mostly derived from PDFBox's AccessPermission, but some can + * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM. + */ +public interface AccessPermissions { + + final static String PREFIX = "access_permission"+Metadata.NAMESPACE_PREFIX_DELIMITER; + + /** + * Can any modifications be made to the document + */ + Property CAN_MODIFY = Property.externalTextBag(PREFIX+"can_modify"); + + /** + * Should content be extracted, generally. + */ + Property EXTRACT_CONTENT = Property.externalText(PREFIX+"extract_content"); + + /** + * Should content be extracted for the purposes + * of accessibility. + */ + Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility"); + + /** + * Can the user insert/rotate/delete pages. + */ + Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX+"assemble_document"); + + + /** + * Can the user fill in a form + */ + Property FILL_IN_FORM = Property.externalText(PREFIX+"fill_in_form"); + + /** + * Can the user modify annotations + */ + Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX+"modify_annotations"); + + /** + * Can the user print the document + */ + Property CAN_PRINT = Property.externalText(PREFIX+"can_print"); + + /** + * Can the user print an image-degraded version of the document. 
+ */ + Property CAN_PRINT_DEGRADED = Property.externalText(PREFIX+"can_print_degraded"); + +} Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java Wed May 13 13:49:36 2015 @@ -1,70 +1,70 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.sax; - -import java.io.OutputStream; -import java.io.UnsupportedEncodingException; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -import org.xml.sax.SAXException; - -/** - * SAX event handler that serializes the HTML document to a character stream. - * The incoming SAX events are expected to be well-formed (properly nested, - * etc.) and valid HTML. 
- * - * @since Apache Tika 0.10 - */ -public class ToHTMLContentHandler extends ToXMLContentHandler { - - private static final Set<String> EMPTY_ELEMENTS = - new HashSet<String>(Arrays.asList( - "area", "base", "basefont", "br", "col", "frame", "hr", - "img", "input", "isindex", "link", "meta", "param")); - - public ToHTMLContentHandler(OutputStream stream, String encoding) - throws UnsupportedEncodingException { - super(stream, encoding); - } - - public ToHTMLContentHandler() { - super(); - } - - @Override - public void startDocument() throws SAXException { - } - - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - if (inStartElement) { - write('>'); - inStartElement = false; - - if (EMPTY_ELEMENTS.contains(localName)) { - namespaces.clear(); - return; - } - } - - super.endElement(uri, localName, qName); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.xml.sax.SAXException; + +/** + * SAX event handler that serializes the HTML document to a character stream. 
+ * The incoming SAX events are expected to be well-formed (properly nested, + * etc.) and valid HTML. + * + * @since Apache Tika 0.10 + */ +public class ToHTMLContentHandler extends ToXMLContentHandler { + + private static final Set<String> EMPTY_ELEMENTS = + new HashSet<String>(Arrays.asList( + "area", "base", "basefont", "br", "col", "frame", "hr", + "img", "input", "isindex", "link", "meta", "param")); + + public ToHTMLContentHandler(OutputStream stream, String encoding) + throws UnsupportedEncodingException { + super(stream, encoding); + } + + public ToHTMLContentHandler() { + super(); + } + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if (inStartElement) { + write('>'); + inStartElement = false; + + if (EMPTY_ELEMENTS.contains(localName)) { + namespaces.clear(); + return; + } + } + + super.endElement(uri, localName, qName); + } + +} Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java Wed May 13 13:49:36 2015 @@ -1,140 +1,140 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.sax; - -import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.StringWriter; -import java.io.UnsupportedEncodingException; -import java.io.Writer; -import java.nio.charset.Charset; - -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -/** - * SAX event handler that writes all character content out to a character - * stream. No escaping or other transformations are made on the character - * content. - * - * @since Apache Tika 0.10 - */ -public class ToTextContentHandler extends DefaultHandler { - - /** - * The character stream. - */ - private final Writer writer; - - /** - * Creates a content handler that writes character events to - * the given writer. - * - * @param writer writer - */ - public ToTextContentHandler(Writer writer) { - this.writer = writer; - } - - /** - * Creates a content handler that writes character events to - * the given output stream using the platform default encoding. - * - * @param stream output stream - */ - public ToTextContentHandler(OutputStream stream) { - this(new OutputStreamWriter(stream, Charset.defaultCharset())); - } - - /** - * Creates a content handler that writes character events to - * the given output stream using the given encoding. 
- * - * @param stream output stream - * @param encoding output encoding - * @throws UnsupportedEncodingException if the encoding is unsupported - */ - public ToTextContentHandler(OutputStream stream, String encoding) - throws UnsupportedEncodingException { - this(new OutputStreamWriter(stream, encoding)); - } - - /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. - */ - public ToTextContentHandler() { - this(new StringWriter()); - } - - /** - * Writes the given characters to the given character stream. - */ - @Override - public void characters(char[] ch, int start, int length) - throws SAXException { - try { - writer.write(ch, start, length); - } catch (IOException e) { - throw new SAXException( - "Error writing: " + new String(ch, start, length), e); - } - } - - - /** - * Writes the given ignorable characters to the given character stream. - * The default implementation simply forwards the call to the - * {@link #characters(char[], int, int)} method. - */ - @Override - public void ignorableWhitespace(char[] ch, int start, int length) - throws SAXException { - characters(ch, start, length); - } - - /** - * Flushes the character stream so that no characters are forgotten - * in internal buffers. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a> - * @throws SAXException if the stream can not be flushed - */ - @Override - public void endDocument() throws SAXException { - try { - writer.flush(); - } catch (IOException e) { - throw new SAXException("Error flushing character output", e); - } - } - - /** - * Returns the contents of the internal string buffer where - * all the received characters have been collected. Only works - * when this object was constructed using the empty default - * constructor or by passing a {@link StringWriter} to the - * other constructor. 
- */ - @Override - public String toString() { - return writer.toString(); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.nio.charset.Charset; + +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX event handler that writes all character content out to a character + * stream. No escaping or other transformations are made on the character + * content. + * + * @since Apache Tika 0.10 + */ +public class ToTextContentHandler extends DefaultHandler { + + /** + * The character stream. + */ + private final Writer writer; + + /** + * Creates a content handler that writes character events to + * the given writer. + * + * @param writer writer + */ + public ToTextContentHandler(Writer writer) { + this.writer = writer; + } + + /** + * Creates a content handler that writes character events to + * the given output stream using the platform default encoding. 
+ * + * @param stream output stream + */ + public ToTextContentHandler(OutputStream stream) { + this(new OutputStreamWriter(stream, Charset.defaultCharset())); + } + + /** + * Creates a content handler that writes character events to + * the given output stream using the given encoding. + * + * @param stream output stream + * @param encoding output encoding + * @throws UnsupportedEncodingException if the encoding is unsupported + */ + public ToTextContentHandler(OutputStream stream, String encoding) + throws UnsupportedEncodingException { + this(new OutputStreamWriter(stream, encoding)); + } + + /** + * Creates a content handler that writes character events + * to an internal string buffer. Use the {@link #toString()} + * method to access the collected character content. + */ + public ToTextContentHandler() { + this(new StringWriter()); + } + + /** + * Writes the given characters to the given character stream. + */ + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + try { + writer.write(ch, start, length); + } catch (IOException e) { + throw new SAXException( + "Error writing: " + new String(ch, start, length), e); + } + } + + + /** + * Writes the given ignorable characters to the given character stream. + * The default implementation simply forwards the call to the + * {@link #characters(char[], int, int)} method. + */ + @Override + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + characters(ch, start, length); + } + + /** + * Flushes the character stream so that no characters are forgotten + * in internal buffers. 
+ * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a> + * @throws SAXException if the stream can not be flushed + */ + @Override + public void endDocument() throws SAXException { + try { + writer.flush(); + } catch (IOException e) { + throw new SAXException("Error flushing character output", e); + } + } + + /** + * Returns the contents of the internal string buffer where + * all the received characters have been collected. Only works + * when this object was constructed using the empty default + * constructor or by passing a {@link StringWriter} to the + * other constructor. + */ + @Override + public String toString() { + return writer.toString(); + } + +} Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java Wed May 13 13:49:36 2015 @@ -1,281 +1,281 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.sax; - -import java.io.OutputStream; -import java.io.UnsupportedEncodingException; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -/** - * SAX event handler that serializes the XML document to a character stream. - * The incoming SAX events are expected to be well-formed (properly nested, - * etc.) and to explicitly include namespace declaration attributes and - * corresponding namespace prefixes in element and attribute names. - * - * @since Apache Tika 0.10 - */ -public class ToXMLContentHandler extends ToTextContentHandler { - - private static class ElementInfo { - - private final ElementInfo parent; - - private final Map<String, String> namespaces; - - public ElementInfo(ElementInfo parent, Map<String, String> namespaces) { - this.parent = parent; - if (namespaces.isEmpty()) { - this.namespaces = Collections.emptyMap(); - } else { - this.namespaces = new HashMap<String, String>(namespaces); - } - } - - public String getPrefix(String uri) throws SAXException { - String prefix = namespaces.get(uri); - if (prefix != null) { - return prefix; - } else if (parent != null) { - return parent.getPrefix(uri); - } else if (uri == null || uri.length() == 0) { - return ""; - } else { - throw new SAXException("Namespace " + uri + " not declared"); - } - } - - public String getQName(String uri, String localName) - throws SAXException { - String prefix = getPrefix(uri); - if (prefix.length() > 0) { - return prefix + ":" + localName; - } else { - return localName; - } - } - - } - - private final String encoding; - - protected boolean inStartElement = false; - - protected final Map<String, String> namespaces = - new HashMap<String, String>(); - - private ElementInfo currentElement; - - /** - * Creates an XML serializer that writes to the 
given byte stream - * using the given character encoding. - * - * @param stream output stream - * @param encoding output encoding - * @throws UnsupportedEncodingException if the encoding is unsupported - */ - public ToXMLContentHandler(OutputStream stream, String encoding) - throws UnsupportedEncodingException { - super(stream, encoding); - this.encoding = encoding; - } - - public ToXMLContentHandler(String encoding) { - super(); - this.encoding = encoding; - } - - public ToXMLContentHandler() { - super(); - this.encoding = null; - } - - /** - * Writes the XML prefix. - */ - @Override - public void startDocument() throws SAXException { - if (encoding != null) { - write("<?xml version=\"1.0\" encoding=\""); - write(encoding); - write("\"?>\n"); - } - - currentElement = null; - namespaces.clear(); - } - - @Override - public void startPrefixMapping(String prefix, String uri) - throws SAXException { - try { - if (currentElement != null - && prefix.equals(currentElement.getPrefix(uri))) { - return; - } - } catch (SAXException ignore) { - } - namespaces.put(uri, prefix); - } - - @Override - public void startElement( - String uri, String localName, String qName, Attributes atts) - throws SAXException { - lazyCloseStartElement(); - - currentElement = new ElementInfo(currentElement, namespaces); - - write('<'); - write(currentElement.getQName(uri, localName)); - - for (int i = 0; i < atts.getLength(); i++) { - write(' '); - write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i))); - write('='); - write('"'); - char[] ch = atts.getValue(i).toCharArray(); - writeEscaped(ch, 0, ch.length, true); - write('"'); - } - - for (Map.Entry<String, String> entry : namespaces.entrySet()) { - write(' '); - write("xmlns"); - String prefix = entry.getValue(); - if (prefix.length() > 0) { - write(':'); - write(prefix); - } - write('='); - write('"'); - char[] ch = entry.getKey().toCharArray(); - writeEscaped(ch, 0, ch.length, true); - write('"'); - } - namespaces.clear(); - - 
inStartElement = true; - } - - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - if (inStartElement) { - write(" />"); - inStartElement = false; - } else { - write("</"); - write(qName); - write('>'); - } - - namespaces.clear(); - - // Reset the position in the tree, to avoid endless stack overflow - // chains (see TIKA-1070) - currentElement = currentElement.parent; - } - - @Override - public void characters(char[] ch, int start, int length) - throws SAXException { - lazyCloseStartElement(); - writeEscaped(ch, start, start + length, false); - } - - private void lazyCloseStartElement() throws SAXException { - if (inStartElement) { - write('>'); - inStartElement = false; - } - } - - /** - * Writes the given character as-is. - * - * @param ch character to be written - * @throws SAXException if the character could not be written - */ - protected void write(char ch) throws SAXException { - super.characters(new char[] { ch }, 0, 1); - } - - /** - * Writes the given string of character as-is. - * - * @param string string of character to be written - * @throws SAXException if the character string could not be written - */ - protected void write(String string) throws SAXException { - super.characters(string.toCharArray(), 0, string.length()); - } - - /** - * Writes the given characters as-is followed by the given entity. - * - * @param ch character array - * @param from start position in the array - * @param to end position in the array - * @param entity entity code - * @return next position in the array, - * after the characters plus one entity - * @throws SAXException if the characters could not be written - */ - private int writeCharsAndEntity(char[] ch, int from, int to, String entity) - throws SAXException { - super.characters(ch, from, to - from); - write('&'); - write(entity); - write(';'); - return to + 1; - } - - /** - * Writes the given characters with XML meta characters escaped. 
- * - * @param ch character array - * @param from start position in the array - * @param to end position in the array - * @param attribute whether the characters should be escaped as - * an attribute value or normal character content - * @throws SAXException if the characters could not be written - */ - private void writeEscaped(char[] ch, int from, int to, boolean attribute) - throws SAXException { - int pos = from; - while (pos < to) { - if (ch[pos] == '<') { - from = pos = writeCharsAndEntity(ch, from, pos, "lt"); - } else if (ch[pos] == '>') { - from = pos = writeCharsAndEntity(ch, from, pos, "gt"); - } else if (ch[pos] == '&') { - from = pos = writeCharsAndEntity(ch, from, pos, "amp"); - } else if (attribute && ch[pos] == '"') { - from = pos = writeCharsAndEntity(ch, from, pos, "quot"); - } else { - pos++; - } - } - super.characters(ch, from, to - from); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.sax; + +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * SAX event handler that serializes the XML document to a character stream. + * The incoming SAX events are expected to be well-formed (properly nested, + * etc.) and to explicitly include namespace declaration attributes and + * corresponding namespace prefixes in element and attribute names. + * + * @since Apache Tika 0.10 + */ +public class ToXMLContentHandler extends ToTextContentHandler { + + private static class ElementInfo { + + private final ElementInfo parent; + + private final Map<String, String> namespaces; + + public ElementInfo(ElementInfo parent, Map<String, String> namespaces) { + this.parent = parent; + if (namespaces.isEmpty()) { + this.namespaces = Collections.emptyMap(); + } else { + this.namespaces = new HashMap<String, String>(namespaces); + } + } + + public String getPrefix(String uri) throws SAXException { + String prefix = namespaces.get(uri); + if (prefix != null) { + return prefix; + } else if (parent != null) { + return parent.getPrefix(uri); + } else if (uri == null || uri.length() == 0) { + return ""; + } else { + throw new SAXException("Namespace " + uri + " not declared"); + } + } + + public String getQName(String uri, String localName) + throws SAXException { + String prefix = getPrefix(uri); + if (prefix.length() > 0) { + return prefix + ":" + localName; + } else { + return localName; + } + } + + } + + private final String encoding; + + protected boolean inStartElement = false; + + protected final Map<String, String> namespaces = + new HashMap<String, String>(); + + private ElementInfo currentElement; + + /** + * Creates an XML serializer that writes to the given byte stream + * using the given character encoding. 
+ * + * @param stream output stream + * @param encoding output encoding + * @throws UnsupportedEncodingException if the encoding is unsupported + */ + public ToXMLContentHandler(OutputStream stream, String encoding) + throws UnsupportedEncodingException { + super(stream, encoding); + this.encoding = encoding; + } + + public ToXMLContentHandler(String encoding) { + super(); + this.encoding = encoding; + } + + public ToXMLContentHandler() { + super(); + this.encoding = null; + } + + /** + * Writes the XML prefix. + */ + @Override + public void startDocument() throws SAXException { + if (encoding != null) { + write("<?xml version=\"1.0\" encoding=\""); + write(encoding); + write("\"?>\n"); + } + + currentElement = null; + namespaces.clear(); + } + + @Override + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + try { + if (currentElement != null + && prefix.equals(currentElement.getPrefix(uri))) { + return; + } + } catch (SAXException ignore) { + } + namespaces.put(uri, prefix); + } + + @Override + public void startElement( + String uri, String localName, String qName, Attributes atts) + throws SAXException { + lazyCloseStartElement(); + + currentElement = new ElementInfo(currentElement, namespaces); + + write('<'); + write(currentElement.getQName(uri, localName)); + + for (int i = 0; i < atts.getLength(); i++) { + write(' '); + write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i))); + write('='); + write('"'); + char[] ch = atts.getValue(i).toCharArray(); + writeEscaped(ch, 0, ch.length, true); + write('"'); + } + + for (Map.Entry<String, String> entry : namespaces.entrySet()) { + write(' '); + write("xmlns"); + String prefix = entry.getValue(); + if (prefix.length() > 0) { + write(':'); + write(prefix); + } + write('='); + write('"'); + char[] ch = entry.getKey().toCharArray(); + writeEscaped(ch, 0, ch.length, true); + write('"'); + } + namespaces.clear(); + + inStartElement = true; + } + + @Override + public void 
endElement(String uri, String localName, String qName) + throws SAXException { + if (inStartElement) { + write(" />"); + inStartElement = false; + } else { + write("</"); + write(qName); + write('>'); + } + + namespaces.clear(); + + // Reset the position in the tree, to avoid endless stack overflow + // chains (see TIKA-1070) + currentElement = currentElement.parent; + } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + lazyCloseStartElement(); + writeEscaped(ch, start, start + length, false); + } + + private void lazyCloseStartElement() throws SAXException { + if (inStartElement) { + write('>'); + inStartElement = false; + } + } + + /** + * Writes the given character as-is. + * + * @param ch character to be written + * @throws SAXException if the character could not be written + */ + protected void write(char ch) throws SAXException { + super.characters(new char[] { ch }, 0, 1); + } + + /** + * Writes the given string of character as-is. + * + * @param string string of character to be written + * @throws SAXException if the character string could not be written + */ + protected void write(String string) throws SAXException { + super.characters(string.toCharArray(), 0, string.length()); + } + + /** + * Writes the given characters as-is followed by the given entity. + * + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param entity entity code + * @return next position in the array, + * after the characters plus one entity + * @throws SAXException if the characters could not be written + */ + private int writeCharsAndEntity(char[] ch, int from, int to, String entity) + throws SAXException { + super.characters(ch, from, to - from); + write('&'); + write(entity); + write(';'); + return to + 1; + } + + /** + * Writes the given characters with XML meta characters escaped. 
+ * + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param attribute whether the characters should be escaped as + * an attribute value or normal character content + * @throws SAXException if the characters could not be written + */ + private void writeEscaped(char[] ch, int from, int to, boolean attribute) + throws SAXException { + int pos = from; + while (pos < to) { + if (ch[pos] == '<') { + from = pos = writeCharsAndEntity(ch, from, pos, "lt"); + } else if (ch[pos] == '>') { + from = pos = writeCharsAndEntity(ch, from, pos, "gt"); + } else if (ch[pos] == '&') { + from = pos = writeCharsAndEntity(ch, from, pos, "amp"); + } else if (attribute && ch[pos] == '"') { + from = pos = writeCharsAndEntity(ch, from, pos, "quot"); + } else { + pos++; + } + } + super.characters(ch, from, to - from); + } + +} Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java Wed May 13 13:49:36 2015 @@ -1,301 +1,301 @@ -package org.apache.tika.parser.mock; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Constructor; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; -import org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -/** - * This class enables mocking of parser behavior for use in testing - * wrappers and drivers of parsers. - * <p> - * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation - * of all the options for this MockParser. - * <p> - * Tests for this class are in tika-parsers. - * <p> - * See also {@link org.apache.tika.parser.DummyParser} for another option. 
- */ - -public class MockParser extends AbstractParser { - - private static final long serialVersionUID = 1L; - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - Set<MediaType> types = new HashSet<MediaType>(); - MediaType type = MediaType.application("mock+xml"); - types.add(type); - return types; - } - - @Override - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, - SAXException, TikaException { - Document doc = null; - DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = null; - try { - docBuilder = fact.newDocumentBuilder(); - doc = docBuilder.parse(stream); - } catch (ParserConfigurationException e) { - throw new IOException(e); - } catch (SAXException e) { - throw new IOException(e); - } - Node root = doc.getDocumentElement(); - NodeList actions = root.getChildNodes(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - for (int i = 0; i < actions.getLength(); i++) { - executeAction(actions.item(i), metadata, xhtml); - } - xhtml.endDocument(); - } - - private void executeAction(Node action, Metadata metadata, XHTMLContentHandler xhtml) throws SAXException, - IOException, TikaException { - - if (action.getNodeType() != 1) { - return; - } - - String name = action.getNodeName(); - if ("metadata".equals(name)) { - metadata(action, metadata); - } else if("write".equals(name)) { - write(action, xhtml); - } else if ("throw".equals(name)) { - throwIt(action); - } else if ("hang".equals(name)) { - hang(action); - } else if ("oom".equals(name)) { - kabOOM(); - } else if ("print_out".equals(name) || "print_err".equals(name)){ - print(action, name); - } else { - throw new IllegalArgumentException("Didn't recognize mock action: "+name); - } - } - - private void print(Node action, String name) { - String content = action.getTextContent(); - if ("print_out".equals(name)) { - 
System.out.println(content); - } else if ("print_err".equals(name)) { - System.err.println(content); - } else { - throw new IllegalArgumentException("must be print_out or print_err"); - } - } - private void hang(Node action) { - boolean interruptible = true; - boolean heavy = false; - long millis = -1; - long pulseMillis = -1; - NamedNodeMap attrs = action.getAttributes(); - Node iNode = attrs.getNamedItem("interruptible"); - if (iNode != null) { - interruptible = ("true".equals(iNode.getNodeValue())); - } - Node hNode = attrs.getNamedItem("heavy"); - if (hNode != null) { - heavy = ("true".equals(hNode.getNodeValue())); - } - - Node mNode = attrs.getNamedItem("millis"); - if (mNode == null) { - throw new RuntimeException("Must specify \"millis\" attribute for hang."); - } - String millisString = mNode.getNodeValue(); - try { - millis = Long.parseLong(millisString); - } catch (NumberFormatException e) { - throw new RuntimeException("Value for \"millis\" attribute must be a long."); - } - - if (heavy) { - Node pNode = attrs.getNamedItem("pulse_millis"); - if (pNode == null) { - throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\""); - } - String pulseMillisString = mNode.getNodeValue(); - try { - pulseMillis = Long.parseLong(pulseMillisString); - } catch (NumberFormatException e) { - throw new RuntimeException("Value for \"millis\" attribute must be a long."); - } - } - if (heavy) { - hangHeavy(millis, pulseMillis, interruptible); - } else { - sleep(millis, interruptible); - } - } - - private void throwIt(Node action) throws IOException, - SAXException, TikaException { - NamedNodeMap attrs = action.getAttributes(); - String className = attrs.getNamedItem("class").getNodeValue(); - String msg = action.getTextContent(); - throwIt(className, msg); - } - - private void metadata(Node action, Metadata metadata) { - NamedNodeMap attrs = action.getAttributes(); - //throws npe unless there is a name - String name = 
attrs.getNamedItem("name").getNodeValue(); - String value = action.getTextContent(); - Node actionType = attrs.getNamedItem("action"); - if (actionType == null) { - metadata.add(name, value); - } else { - if ("set".equals(actionType.getNodeValue())) { - metadata.set(name, value); - } else { - metadata.add(name, value); - } - } - } - - private void write(Node action, XHTMLContentHandler xhtml) throws SAXException { - NamedNodeMap attrs = action.getAttributes(); - Node eNode = attrs.getNamedItem("element"); - String elementType = "p"; - if (eNode != null) { - elementType = eNode.getTextContent(); - } - String text = action.getTextContent(); - xhtml.startElement(elementType); - xhtml.characters(text); - xhtml.endElement(elementType); - } - - - private void throwIt(String className, String msg) throws IOException, - SAXException, TikaException { - Throwable t = null; - if (msg == null || msg.equals("")) { - try { - t = (Throwable) Class.forName(className).newInstance(); - } catch (Exception e) { - throw new RuntimeException("couldn't create throwable class:"+className, e); - } - } else { - try { - Class<?> clazz = Class.forName(className); - Constructor<?> con = clazz.getConstructor(String.class); - t = (Throwable) con.newInstance(msg); - } catch (Exception e) { - throw new RuntimeException("couldn't create throwable class:" + className, e); - } - } - if (t instanceof SAXException) { - throw (SAXException)t; - } else if (t instanceof IOException) { - throw (IOException) t; - } else if (t instanceof TikaException) { - throw (TikaException) t; - } else if (t instanceof Error) { - throw (Error) t; - } else if (t instanceof RuntimeException) { - throw (RuntimeException) t; - } else { - //wrap the throwable in a RuntimeException - throw new RuntimeException(t); - } - } - - private void kabOOM() { - List<int[]> ints = new ArrayList<int[]>(); - - while (true) { - int[] intArr = new int[32000]; - ints.add(intArr); - } - } - - private void hangHeavy(long maxMillis, long 
pulseCheckMillis, boolean interruptible) { - //do some heavy computation and occasionally check for - //whether time has exceeded maxMillis (see TIKA-1132 for inspiration) - //or whether the thread was interrupted - long start = new Date().getTime(); - int lastChecked = 0; - while (true) { - for (int i = 1; i < Integer.MAX_VALUE; i++) { - for (int j = 1; j < Integer.MAX_VALUE; j++) { - double div = (double) i / (double) j; - lastChecked++; - if (lastChecked > pulseCheckMillis) { - lastChecked = 0; - if (interruptible && Thread.currentThread().isInterrupted()) { - return; - } - long elapsed = new Date().getTime()-start; - if (elapsed > maxMillis) { - return; - } - } - } - } - } - } - - private void sleep(long maxMillis, boolean isInterruptible) { - long start = new Date().getTime(); - long millisRemaining = maxMillis; - while (true) { - try { - Thread.sleep(millisRemaining); - } catch (InterruptedException e) { - if (isInterruptible) { - return; - } - } - long elapsed = new Date().getTime()-start; - millisRemaining = maxMillis - elapsed; - if (millisRemaining <= 0) { - break; - } - } - } +package org.apache.tika.parser.mock; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Constructor; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * This class enables mocking of parser behavior for use in testing + * wrappers and drivers of parsers. + * <p> + * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation + * of all the options for this MockParser. + * <p> + * Tests for this class are in tika-parsers. + * <p> + * See also {@link org.apache.tika.parser.DummyParser} for another option. 
+ */ + +public class MockParser extends AbstractParser { + + private static final long serialVersionUID = 1L; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + Set<MediaType> types = new HashSet<MediaType>(); + MediaType type = MediaType.application("mock+xml"); + types.add(type); + return types; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + Document doc = null; + DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = null; + try { + docBuilder = fact.newDocumentBuilder(); + doc = docBuilder.parse(stream); + } catch (ParserConfigurationException e) { + throw new IOException(e); + } catch (SAXException e) { + throw new IOException(e); + } + Node root = doc.getDocumentElement(); + NodeList actions = root.getChildNodes(); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + for (int i = 0; i < actions.getLength(); i++) { + executeAction(actions.item(i), metadata, xhtml); + } + xhtml.endDocument(); + } + + private void executeAction(Node action, Metadata metadata, XHTMLContentHandler xhtml) throws SAXException, + IOException, TikaException { + + if (action.getNodeType() != 1) { + return; + } + + String name = action.getNodeName(); + if ("metadata".equals(name)) { + metadata(action, metadata); + } else if("write".equals(name)) { + write(action, xhtml); + } else if ("throw".equals(name)) { + throwIt(action); + } else if ("hang".equals(name)) { + hang(action); + } else if ("oom".equals(name)) { + kabOOM(); + } else if ("print_out".equals(name) || "print_err".equals(name)){ + print(action, name); + } else { + throw new IllegalArgumentException("Didn't recognize mock action: "+name); + } + } + + private void print(Node action, String name) { + String content = action.getTextContent(); + if ("print_out".equals(name)) { + 
System.out.println(content);
+        } else if ("print_err".equals(name)) {
+            System.err.println(content);
+        } else {
+            throw new IllegalArgumentException("must be print_out or print_err");
+        }
+    }
+
+    /**
+     * Simulates a hanging parser as configured by the attributes of
+     * <code>action</code>:
+     * <ul>
+     *   <li>"millis" (required): how long to hang, parsed as a long</li>
+     *   <li>"interruptible" (optional, defaults to true): whether an
+     *       interrupt ends the hang early</li>
+     *   <li>"heavy" (optional, defaults to false): busy-loop instead of
+     *       sleeping</li>
+     *   <li>"pulse_millis" (required when heavy): parsed as a long and
+     *       handed to the busy loop as its check interval</li>
+     * </ul>
+     *
+     * @param action element carrying the hang attributes
+     * @throws RuntimeException if a required attribute is missing or is
+     *         not parseable as a long
+     */
+    private void hang(Node action) {
+        boolean interruptible = true;
+        boolean heavy = false;
+        long millis = -1;
+        NamedNodeMap attrs = action.getAttributes();
+        Node iNode = attrs.getNamedItem("interruptible");
+        if (iNode != null) {
+            interruptible = ("true".equals(iNode.getNodeValue()));
+        }
+        Node hNode = attrs.getNamedItem("heavy");
+        if (hNode != null) {
+            heavy = ("true".equals(hNode.getNodeValue()));
+        }
+
+        Node mNode = attrs.getNamedItem("millis");
+        if (mNode == null) {
+            throw new RuntimeException("Must specify \"millis\" attribute for hang.");
+        }
+        String millisString = mNode.getNodeValue();
+        try {
+            millis = Long.parseLong(millisString);
+        } catch (NumberFormatException e) {
+            throw new RuntimeException("Value for \"millis\" attribute must be a long.", e);
+        }
+
+        if (heavy) {
+            Node pNode = attrs.getNamedItem("pulse_millis");
+            if (pNode == null) {
+                throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\"");
+            }
+            //the pulse interval comes from its own attribute, not from "millis"
+            String pulseMillisString = pNode.getNodeValue();
+            long pulseMillis;
+            try {
+                pulseMillis = Long.parseLong(pulseMillisString);
+            } catch (NumberFormatException e) {
+                throw new RuntimeException("Value for \"pulse_millis\" attribute must be a long.", e);
+            }
+            hangHeavy(millis, pulseMillis, interruptible);
+        } else {
+            sleep(millis, interruptible);
+        }
+    }
+
+    /**
+     * Throws an instance of the class named by the "class" attribute of
+     * <code>action</code>, using the element's text content as the message.
+     * The "class" attribute is dereferenced without a null check.
+     */
+    private void throwIt(Node action) throws IOException,
+            SAXException, TikaException {
+        NamedNodeMap attrs = action.getAttributes();
+        String className = attrs.getNamedItem("class").getNodeValue();
+        String msg = action.getTextContent();
+        throwIt(className, msg);
+    }
+
+    /**
+     * Writes the text content of <code>action</code> into
+     * <code>metadata</code> under the key given by the "name" attribute.
+     * The value is set only when the "action" attribute equals "set";
+     * otherwise (including when the attribute is absent) it is added.
+     */
+    private void metadata(Node action, Metadata metadata) {
+        NamedNodeMap attrs = action.getAttributes();
+        //throws npe unless there is a name
+        String name = attrs.getNamedItem("name").getNodeValue();
+        String value = action.getTextContent();
+        Node actionType = attrs.getNamedItem("action");
+        if (actionType == null) {
+            metadata.add(name, value);
+        } else {
+            if ("set".equals(actionType.getNodeValue())) {
+                metadata.set(name, value);
+            } else {
+                metadata.add(name, value);
+            }
+        }
+    }
+
+    /**
+     * Emits the text content of <code>action</code> wrapped in a single
+     * element. The element name is taken from the "element" attribute and
+     * falls back to "p" when that attribute is absent.
+     */
+    private void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
+        NamedNodeMap attrs = action.getAttributes();
+        Node eNode = attrs.getNamedItem("element");
+        String elementType = "p";
+        if (eNode != null) {
+            elementType = eNode.getTextContent();
+        }
+        String text = action.getTextContent();
+        xhtml.startElement(elementType);
+        xhtml.characters(text);
+        xhtml.endElement(elementType);
+    }
+
+
+    /**
+     * Reflectively instantiates <code>className</code> and throws it.
+     * The no-arg constructor is used when <code>msg</code> is null or
+     * empty, otherwise the single-String constructor. Throwables that are
+     * not SAXException, IOException, TikaException, Error or
+     * RuntimeException are wrapped in a RuntimeException.
+     *
+     * @throws RuntimeException if the throwable cannot be instantiated
+     */
+    private void throwIt(String className, String msg) throws IOException,
+            SAXException, TikaException {
+        Throwable t = null;
+        if (msg == null || msg.equals("")) {
+            try {
+                t = (Throwable) Class.forName(className).newInstance();
+            } catch (Exception e) {
+                throw new RuntimeException("couldn't create throwable class:"+className, e);
+            }
+        } else {
+            try {
+                Class<?> clazz = Class.forName(className);
+                Constructor<?> con = clazz.getConstructor(String.class);
+                t = (Throwable) con.newInstance(msg);
+            } catch (Exception e) {
+                throw new RuntimeException("couldn't create throwable class:" + className, e);
+            }
+        }
+        if (t instanceof SAXException) {
+            throw (SAXException)t;
+        } else if (t instanceof IOException) {
+            throw (IOException) t;
+        } else if (t instanceof TikaException) {
+            throw (TikaException) t;
+        } else if (t instanceof Error) {
+            throw (Error) t;
+        } else if (t instanceof RuntimeException) {
+            throw (RuntimeException) t;
+        } else {
+            //wrap the throwable in a RuntimeException
+            throw new RuntimeException(t);
+        }
+    }
+
+    /**
+     * Allocates and retains int arrays in an endless loop; the loop has no
+     * exit, so it only ends when allocation fails (presumably the intent
+     * is to provoke an OutOfMemoryError).
+     */
+    private void kabOOM() {
+        List<int[]> ints = new ArrayList<int[]>();
+
+        while (true) {
+            int[] intArr = new int[32000];
+            ints.add(intArr);
+        }
+    }
+
+    /**
+     * Busy-loops until <code>maxMillis</code> have elapsed or, if
+     * <code>interruptible</code>, until the current thread is interrupted.
+     * Note that <code>pulseCheckMillis</code> is compared against an
+     * iteration counter, so despite its name it acts as the number of
+     * inner-loop iterations between checks.
+     */
+    private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
+        //do some heavy computation and occasionally check for
+        //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
+        //or whether the thread was interrupted
+        long start = new Date().getTime();
+        int lastChecked = 0;
+        while (true) {
+            for (int i = 1; i < Integer.MAX_VALUE; i++) {
+                for (int j = 1; j < Integer.MAX_VALUE; j++) {
+                    double div = (double) i / (double) j;
+                    lastChecked++;
+                    if (lastChecked > pulseCheckMillis) {
+                        lastChecked = 0;
+                        if (interruptible && Thread.currentThread().isInterrupted()) {
+                            return;
+                        }
+                        long elapsed = new Date().getTime()-start;
+                        if (elapsed > maxMillis) {
+                            return;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Sleeps until <code>maxMillis</code> have elapsed. An interrupt ends
+     * the sleep immediately when <code>isInterruptible</code> is true;
+     * otherwise the interrupt is ignored and sleeping resumes for the
+     * remaining time.
+     */
+    private void sleep(long maxMillis, boolean isInterruptible) {
+        long start = new Date().getTime();
+        long millisRemaining = maxMillis;
+        while (true) {
+            try {
+                Thread.sleep(millisRemaining);
+            } catch (InterruptedException e) {
+                if (isInterruptible) {
+                    return;
+                }
+            }
+            long elapsed = new Date().getTime()-start;
+            millisRemaining = maxMillis - elapsed;
+            if (millisRemaining <= 0) {
+                break;
+            }
+        }
+    }
 }
\ No newline at end of file

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java?rev=1679211&r1=1679210&r2=1679211&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java Wed May 13 13:49:36 2015
@@ -1,150 +1,150 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import static org.junit.Assert.assertEquals;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Feeds fixed sequences of SAX events to the text, XML and HTML
- * serializing handlers and compares <code>handler.toString()</code> with
- * the expected output.
- * <p>
- * NOTE(review): the entity-encoded expectations in the XML and HTML tests
- * were restored from a copy in which the entities had been decoded;
- * confirm them against the serializers.
- */
-public class SerializerTest {
-
-    /** Text output: markup is dropped and characters are written unescaped. */
-    @Test
-    public void testToTextContentHandler() throws Exception {
-        assertStartDocument("", new ToTextContentHandler());
-        assertCharacters("content", new ToTextContentHandler());
-        assertCharacterEscaping("<&\">", new ToTextContentHandler());
-        assertIgnorableWhitespace(" \t\r\n", new ToTextContentHandler());
-        assertEmptyElement("", new ToTextContentHandler());
-        assertEmptyElementWithAttributes("", new ToTextContentHandler());
-        assertEmptyElementWithAttributeEscaping("", new ToTextContentHandler());
-        assertElement("content", new ToTextContentHandler());
-        assertElementWithAttributes("content", new ToTextContentHandler());
-    }
-
-    /** XML output: self-closing empty elements and entity-escaped text. */
-    @Test
-    public void testToXMLContentHandler() throws Exception {
-        assertStartDocument("", new ToXMLContentHandler());
-        assertStartDocument(
-                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
-                new ToXMLContentHandler("UTF-8"));
-        assertCharacters("content", new ToXMLContentHandler());
-        assertCharacterEscaping("&lt;&amp;\"&gt;", new ToXMLContentHandler());
-        assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
-        assertEmptyElement("<br />", new ToXMLContentHandler());
-        assertEmptyElementWithAttributes(
-                "<meta name=\"foo\" value=\"bar\" />",
-                new ToXMLContentHandler());
-        assertEmptyElementWithAttributeEscaping(
-                "<p class=\"&lt;&amp;&quot;&gt;\" />",
-                new ToXMLContentHandler());
-        assertElement("<p>content</p>", new ToXMLContentHandler());
-        assertElementWithAttributes(
-                "<p class=\"test\">content</p>",
-                new ToXMLContentHandler());
-    }
-
-    /** HTML output: as for XML, but empty elements are not self-closed. */
-    @Test
-    public void testToHTMLContentHandler() throws Exception {
-        assertStartDocument("", new ToHTMLContentHandler());
-        assertCharacters("content", new ToHTMLContentHandler());
-        assertCharacterEscaping("&lt;&amp;\"&gt;", new ToHTMLContentHandler());
-        assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
-        assertEmptyElement("<br>", new ToHTMLContentHandler());
-        assertEmptyElementWithAttributes(
-                "<meta name=\"foo\" value=\"bar\">",
-                new ToHTMLContentHandler());
-        assertEmptyElementWithAttributeEscaping(
-                "<p class=\"&lt;&amp;&quot;&gt;\"></p>",
-                new ToHTMLContentHandler());
-        assertElement("<p>content</p>", new ToHTMLContentHandler());
-        assertElementWithAttributes(
-                "<p class=\"test\">content</p>",
-                new ToHTMLContentHandler());
-    }
-
-    /** Sends startDocument() and checks the serialized result. */
-    private void assertStartDocument(String expected, ContentHandler handler)
-            throws Exception {
-        handler.startDocument();
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends the seven characters of "content" and checks the result. */
-    private void assertCharacters(String expected, ContentHandler handler)
-            throws Exception {
-        handler.characters("content".toCharArray(), 0, 7);
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends the four markup-significant characters <&"> as text. */
-    private void assertCharacterEscaping(
-            String expected, ContentHandler handler) throws Exception {
-        handler.characters("<&\">".toCharArray(), 0, 4);
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends space, tab, CR and LF as ignorable whitespace. */
-    private void assertIgnorableWhitespace(
-            String expected, ContentHandler handler) throws Exception {
-        handler.ignorableWhitespace(" \t\r\n".toCharArray(), 0, 4);
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends an empty br element without attributes. */
-    private void assertEmptyElement(String expected, ContentHandler handler)
-            throws Exception {
-        AttributesImpl attributes = new AttributesImpl();
-        handler.startElement("", "br", "br", attributes);
-        handler.endElement("", "br", "br");
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends an empty meta element with name="foo" and value="bar". */
-    private void assertEmptyElementWithAttributes(
-            String expected, ContentHandler handler) throws Exception {
-        AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", "name", "name", "CDATA", "foo");
-        attributes.addAttribute("", "value", "value", "CDATA", "bar");
-        handler.startElement("", "meta", "meta", attributes);
-        handler.endElement("", "meta", "meta");
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends an empty p element whose class attribute value is <&">. */
-    private void assertEmptyElementWithAttributeEscaping(
-            String expected, ContentHandler handler) throws Exception {
-        AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", "class", "class", "CDATA", "<&\">");
-        handler.startElement("", "p", "p", attributes);
-        handler.endElement("", "p", "p");
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends a p element containing the text "content". */
-    private void assertElement(
-            String expected, ContentHandler handler) throws Exception {
-        AttributesImpl attributes = new AttributesImpl();
-        handler.startElement("", "p", "p", attributes);
-        handler.characters("content".toCharArray(), 0, 7);
-        handler.endElement("", "p", "p");
-        assertEquals(expected, handler.toString());
-    }
-
-    /** Sends a p element with class="test" containing the text "content". */
-    private void assertElementWithAttributes(
-            String expected, ContentHandler handler) throws Exception {
-        AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", "class", "class", "CDATA", "test");
-        handler.startElement("", "p", "p", attributes);
-        handler.characters("content".toCharArray(), 0, 7);
-        handler.endElement("", "p", "p");
-        assertEquals(expected, handler.toString());
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Feeds fixed sequences of SAX events to the text, XML and HTML
+ * serializing handlers and compares <code>handler.toString()</code> with
+ * the expected output.
+ * <p>
+ * NOTE(review): the entity-encoded expectations in the XML and HTML tests
+ * were restored from a copy in which the entities had been decoded;
+ * confirm them against the serializers.
+ */
+public class SerializerTest {
+
+    /** Text output: markup is dropped and characters are written unescaped. */
+    @Test
+    public void testToTextContentHandler() throws Exception {
+        assertStartDocument("", new ToTextContentHandler());
+        assertCharacters("content", new ToTextContentHandler());
+        assertCharacterEscaping("<&\">", new ToTextContentHandler());
+        assertIgnorableWhitespace(" \t\r\n", new ToTextContentHandler());
+        assertEmptyElement("", new ToTextContentHandler());
+        assertEmptyElementWithAttributes("", new ToTextContentHandler());
+        assertEmptyElementWithAttributeEscaping("", new ToTextContentHandler());
+        assertElement("content", new ToTextContentHandler());
+        assertElementWithAttributes("content", new ToTextContentHandler());
+    }
+
+    /** XML output: self-closing empty elements and entity-escaped text. */
+    @Test
+    public void testToXMLContentHandler() throws Exception {
+        assertStartDocument("", new ToXMLContentHandler());
+        assertStartDocument(
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
+                new ToXMLContentHandler("UTF-8"));
+        assertCharacters("content", new ToXMLContentHandler());
+        assertCharacterEscaping("&lt;&amp;\"&gt;", new ToXMLContentHandler());
+        assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
+        assertEmptyElement("<br />", new ToXMLContentHandler());
+        assertEmptyElementWithAttributes(
+                "<meta name=\"foo\" value=\"bar\" />",
+                new ToXMLContentHandler());
+        assertEmptyElementWithAttributeEscaping(
+                "<p class=\"&lt;&amp;&quot;&gt;\" />",
+                new ToXMLContentHandler());
+        assertElement("<p>content</p>", new ToXMLContentHandler());
+        assertElementWithAttributes(
+                "<p class=\"test\">content</p>",
+                new ToXMLContentHandler());
+    }
+
+    /** HTML output: as for XML, but empty elements are not self-closed. */
+    @Test
+    public void testToHTMLContentHandler() throws Exception {
+        assertStartDocument("", new ToHTMLContentHandler());
+        assertCharacters("content", new ToHTMLContentHandler());
+        assertCharacterEscaping("&lt;&amp;\"&gt;", new ToHTMLContentHandler());
+        assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
+        assertEmptyElement("<br>", new ToHTMLContentHandler());
+        assertEmptyElementWithAttributes(
+                "<meta name=\"foo\" value=\"bar\">",
+                new ToHTMLContentHandler());
+        assertEmptyElementWithAttributeEscaping(
+                "<p class=\"&lt;&amp;&quot;&gt;\"></p>",
+                new ToHTMLContentHandler());
+        assertElement("<p>content</p>", new ToHTMLContentHandler());
+        assertElementWithAttributes(
+                "<p class=\"test\">content</p>",
+                new ToHTMLContentHandler());
+    }
+
+    /** Sends startDocument() and checks the serialized result. */
+    private void assertStartDocument(String expected, ContentHandler handler)
+            throws Exception {
+        handler.startDocument();
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends the seven characters of "content" and checks the result. */
+    private void assertCharacters(String expected, ContentHandler handler)
+            throws Exception {
+        handler.characters("content".toCharArray(), 0, 7);
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends the four markup-significant characters <&"> as text. */
+    private void assertCharacterEscaping(
+            String expected, ContentHandler handler) throws Exception {
+        handler.characters("<&\">".toCharArray(), 0, 4);
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends space, tab, CR and LF as ignorable whitespace. */
+    private void assertIgnorableWhitespace(
+            String expected, ContentHandler handler) throws Exception {
+        handler.ignorableWhitespace(" \t\r\n".toCharArray(), 0, 4);
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends an empty br element without attributes. */
+    private void assertEmptyElement(String expected, ContentHandler handler)
+            throws Exception {
+        AttributesImpl attributes = new AttributesImpl();
+        handler.startElement("", "br", "br", attributes);
+        handler.endElement("", "br", "br");
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends an empty meta element with name="foo" and value="bar". */
+    private void assertEmptyElementWithAttributes(
+            String expected, ContentHandler handler) throws Exception {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "name", "name", "CDATA", "foo");
+        attributes.addAttribute("", "value", "value", "CDATA", "bar");
+        handler.startElement("", "meta", "meta", attributes);
+        handler.endElement("", "meta", "meta");
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends an empty p element whose class attribute value is <&">. */
+    private void assertEmptyElementWithAttributeEscaping(
+            String expected, ContentHandler handler) throws Exception {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "<&\">");
+        handler.startElement("", "p", "p", attributes);
+        handler.endElement("", "p", "p");
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends a p element containing the text "content". */
+    private void assertElement(
+            String expected, ContentHandler handler) throws Exception {
+        AttributesImpl attributes = new AttributesImpl();
+        handler.startElement("", "p", "p", attributes);
+        handler.characters("content".toCharArray(), 0, 7);
+        handler.endElement("", "p", "p");
+        assertEquals(expected, handler.toString());
+    }
+
+    /** Sends a p element with class="test" containing the text "content". */
+    private void assertElementWithAttributes(
+            String expected, ContentHandler handler) throws Exception {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "test");
+        handler.startElement("", "p", "p", attributes);
+        handler.characters("content".toCharArray(), 0, 7);
+        handler.endElement("", "p", "p");
+        assertEquals(expected, handler.toString());
+    }
+
+}