Author: jukka Date: Thu Apr 10 03:52:06 2008 New Revision: 646748 URL: http://svn.apache.org/viewvc?rev=646748&view=rev Log: TIKA-113: Metadata (such as title) should not be part of content - Added BodyContentHandler that only processes XHTML body events - Added utility constructors for WriteOutContentHandler and BodyContentHandler - Updated test cases and related code to use BodyContentHandler where appropriate - Removed AppendableAdaptor class as it's not used anymore
Added: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Removed: incubator/tika/trunk/src/main/java/org/apache/tika/sax/AppendableAdaptor.java incubator/tika/trunk/src/test/java/org/apache/tika/sax/AppendableAdaptorTest.java Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Thu Apr 10 03:52:06 2008 @@ -43,6 +43,8 @@ 18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting) +19. 
TIKA-113 - Metadata (such as title) should not be part of content + (Jukka Zitting) Release 0.1-incubating - 12/27/2007 Modified: incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Apr 10 03:52:06 2008 @@ -18,11 +18,8 @@ import java.io.File; import java.io.FileInputStream; -import java.io.IOException; import java.io.InputStream; -import java.io.OutputStreamWriter; import java.io.PrintStream; -import java.io.Writer; import java.net.URL; import java.util.Arrays; @@ -41,12 +38,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.WriteOutContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.MatchingContentHandler; -import org.apache.tika.sax.xpath.XPathParser; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** @@ -176,17 +169,7 @@ } private ContentHandler getTextContentHandler() { - final Writer writer = new OutputStreamWriter(System.out); - XPathParser parser = - new XPathParser("xhtml", XHTMLContentHandler.XHTML); - return new MatchingContentHandler( - new WriteOutContentHandler(writer), - parser.parse("/xhtml:html/xhtml:body//text()")) { - public void endDocument() throws SAXException { - super.endDocument(); - try { writer.flush(); } catch (IOException e) {} - } - }; + return new BodyContentHandler(System.out); } private ContentHandler getMetadataContentHandler() { Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Apr 10 03:52:06 2008 @@ -43,12 +43,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.TeeContentHandler; -import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.MatchingContentHandler; -import org.apache.tika.sax.xpath.XPathParser; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -256,11 +254,7 @@ } private ContentHandler getTextContentHandler(Writer writer) { - XPathParser parser = - new XPathParser("xhtml", XHTMLContentHandler.XHTML); - return new MatchingContentHandler( - new WriteOutContentHandler(writer), - parser.parse("/xhtml:html/xhtml:body//text()")); + return new BodyContentHandler(writer); } private ContentHandler getXmlContentHandler(Writer writer) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Thu Apr 10 03:52:06 2008 
@@ -18,12 +18,11 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.TeeContentHandler; -import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.utils.RegexUtils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -53,12 +52,10 @@ public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { - StringWriter writer = new StringWriter(); - handler = new TeeContentHandler( - handler, new WriteOutContentHandler(writer)); - super.parse(stream, handler, metadata); + ContentHandler body = new BodyContentHandler(); + super.parse(stream, new TeeContentHandler(handler, body), metadata); - String content = writer.toString(); + String content = body.toString(); metadata.set("fulltext", content); int length = Math.min(content.length(), 500); Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Apr 10 03:52:06 2008 @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -109,11 +108,10 @@ } private ContentHandler getTitleHandler(final Metadata metadata) { - final StringWriter writer = new StringWriter(); - return new WriteOutContentHandler(writer) { + return new WriteOutContentHandler() { @Override public void 
endElement(String u, String l, String n) { - metadata.set(Metadata.TITLE, writer.toString()); + metadata.set(Metadata.TITLE, toString()); } }; } Added: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=646748&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Apr 10 03:52:06 2008 @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.io.OutputStream; +import java.io.Writer; + +import org.apache.tika.sax.xpath.Matcher; +import org.apache.tika.sax.xpath.MatchingContentHandler; +import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; + +/** + * Content handler decorator that only passes the XHTML <body/> + * tag and everything inside it to the underlying handler. + */ +public class BodyContentHandler extends ContentHandlerDecorator { + + /** + * XHTML XPath parser. 
+ */ + private static final XPathParser PARSER = + new XPathParser("xhtml", XHTMLContentHandler.XHTML); + + /** + * The XPath matcher used to select the XHTML body contents. + */ + private static final Matcher MATCHER = + PARSER.parse("/xhtml:html/xhtml:body//node()"); + + /** + * Creates a content handler that passes all XHTML body events to the + * given underlying content handler. + * + * @param handler content handler + */ + public BodyContentHandler(ContentHandler handler) { + super(new MatchingContentHandler(handler, MATCHER)); + } + + /** + * Creates a content handler that writes XHTML body character events to + * the given writer. + * + * @param writer writer + */ + public BodyContentHandler(Writer writer) { + this(new WriteOutContentHandler(writer)); + } + + /** + * Creates a content handler that writes XHTML body character events to + * the given output stream using the default encoding. + * + * @param stream output stream + */ + public BodyContentHandler(OutputStream stream) { + this(new WriteOutContentHandler(stream)); + } + + /** + * Creates a content handler that writes XHTML body character events to + * an internal string buffer. The contents of the buffer can be retrieved + * using the {@link #toString()} method. 
+ */ + public BodyContentHandler() { + this(new WriteOutContentHandler()); + } + +} Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java Thu Apr 10 03:52:06 2008 @@ -94,4 +94,8 @@ handler.skippedEntity(name); } + public String toString() { + return handler.toString(); + } + } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java Thu Apr 10 03:52:06 2008 @@ -17,6 +17,9 @@ package org.apache.tika.sax; import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.StringWriter; import java.io.Writer; import org.xml.sax.SAXException; @@ -33,10 +36,39 @@ */ private final Writer writer; + /** + * Creates a content handler that writes character events to + * the given writer. + * + * @param writer writer + */ public WriteOutContentHandler(Writer writer) { this.writer = writer; } + /** + * Creates a content handler that writes character events to + * the given output stream using the default encoding. 
+ * + * @param stream output stream + */ + public WriteOutContentHandler(OutputStream stream) { + this(new OutputStreamWriter(stream)); + } + + /** + * Creates a content handler that writes character events + * to an internal string buffer. Use the {@link #toString()} + * method to access the collected character content. + */ + public WriteOutContentHandler() { + this(new StringWriter()); + } + + /** + * Writes the given characters to the given character stream. + */ + @Override public void characters(char[] ch, int start, int length) throws SAXException { try { @@ -44,6 +76,18 @@ } catch (IOException e) { throw new SAXException("Error writing out character content", e); } + } + + /** + * Returns the contents of the internal string buffer where + * all the received characters have been collected. Only works + * when this object was constructed using the empty default + * constructor or by passing a {@link StringWriter} to the + * other constructor. + */ + @Override + public String toString() { + return writer.toString(); } } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Thu Apr 10 03:52:06 2008 @@ -23,7 +23,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.net.URL; import java.util.ArrayList; import java.util.List; @@ -33,7 +32,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMimeKeys; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.WriteOutContentHandler; +import 
org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** @@ -169,10 +169,9 @@ throws TikaException, IOException { try { Parser parser = config.getParser(mimeType); - StringWriter writer = new StringWriter(); - parser.parse( - stream, new WriteOutContentHandler(writer), new Metadata()); - return writer.toString(); + ContentHandler handler = new BodyContentHandler(); + parser.parse(stream, handler, new Metadata()); + return handler.toString(); } catch (SAXException e) { throw new TikaException("Unexpected SAX error", e); } Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Thu Apr 10 03:52:06 2008 @@ -18,12 +18,11 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import org.apache.commons.lang.builder.ReflectionToStringBuilder; import org.apache.commons.lang.builder.ToStringStyle; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import junit.framework.TestCase; @@ -63,15 +62,14 @@ Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName); metadata.set(Metadata.CONTENT_TYPE, tp.statedType); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); + ContentHandler handler = new BodyContentHandler(); new AutoDetectParser().parse(input, handler, metadata); assertEquals("Bad content type: " + tp, 
tp.realType, metadata.get(Metadata.CONTENT_TYPE)); assertTrue("Expected content not found: " + tp, - writer.toString().contains(tp.expectedContentFragment)); + handler.toString().contains(tp.expectedContentFragment)); } finally { input.close(); } Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Apr 10 03:52:06 2008 @@ -26,9 +26,12 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.TeeContentHandler; import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class HtmlParserTest extends TestCase { @@ -40,32 +43,32 @@ } public void testParseAscii() throws Exception { - StringWriter writer = new StringWriter(); final StringWriter href = new StringWriter(); - + + ContentHandler body = new BodyContentHandler(); + ContentHandler link = new DefaultHandler() { + @Override + public void startElement( + String u, String l, String n, Attributes a) + throws SAXException { + if ("a".equals(l)) { + href.append(a.getValue("href")); + } + } + }; Metadata metadata = new Metadata(); - parser.parse( - getStream("test-documents/testHTML.html"), - new WriteOutContentHandler(writer) { - @Override - public void startElement( - String uri, String local, String name, - Attributes attributes) { - if ("a".equals(local)) { - 
href.append(attributes.getValue("href")); - } - } - }, - metadata); + InputStream stream = getStream("test-documents/testHTML.html"); + try { + parser.parse(stream, new TeeContentHandler(body, link), metadata); + } finally { + stream.close(); + } assertEquals( "Title : Test Indexation Html", metadata.get(Metadata.TITLE)); assertEquals("http://www.apache.org/", href.toString()); - String content = writer.toString(); - assertTrue( - "Did not contain expected text: Title : Test Indexation Html", - content.contains("Title : Test Indexation Html")); + String content = body.toString(); assertTrue( "Did not contain expected text:" + "Test Indexation Html", content.contains("Test Indexation Html")); @@ -76,13 +79,13 @@ } public void XtestParseUTF8() throws IOException, SAXException, TikaException { - - StringWriter writer = new StringWriter(); + ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); - parser.parse(getStream("test-documents/testHTML_utf8.html"), - new WriteOutContentHandler(writer), metadata); - String content = writer.toString(); + parser.parse( + getStream("test-documents/testHTML_utf8.html"), + handler, metadata); + String content = handler.toString(); assertTrue("Did not contain expected text:" + "Title : Tilte with UTF-8 chars öäå", content @@ -100,8 +103,9 @@ public void testParseEmpty() throws Exception { Metadata metadata = new Metadata(); StringWriter writer = new StringWriter(); - parser.parse(new ByteArrayInputStream(new byte[0]), - new WriteOutContentHandler(writer), metadata); + parser.parse( + new ByteArrayInputStream(new byte[0]), + new BodyContentHandler(writer), metadata); String content = writer.toString(); assertEquals("", content); } Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=646748&r1=646747&r2=646748&view=diff 
============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Thu Apr 10 03:52:06 2008 @@ -17,14 +17,13 @@ package org.apache.tika.parser.microsoft; import java.io.InputStream; -import java.io.StringWriter; + +import junit.framework.TestCase; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; -import junit.framework.TestCase; - public class ExcelParserTest extends TestCase { public void testExcelParser() throws Exception { @@ -32,8 +31,7 @@ "/test-documents/testEXCEL.xls"); try { Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); + ContentHandler handler = new BodyContentHandler(); new OfficeParser().parse(input, handler, metadata); assertEquals( @@ -41,7 +39,7 @@ metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Simple Excel document", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - String content = writer.toString(); + String content = handler.toString(); assertTrue(content.contains("Sample Excel Worksheet")); assertTrue(content.contains("Numbers and their Squares")); assertTrue(content.contains("9")); Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original) +++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Apr 10 03:52:06 2008 @@ -17,10 +17,9 @@ package org.apache.tika.parser.microsoft; import java.io.InputStream; -import java.io.StringWriter; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import junit.framework.TestCase; @@ -32,8 +31,7 @@ "/test-documents/testPPT.ppt"); try { Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); + ContentHandler handler = new BodyContentHandler(); new OfficeParser().parse(input, handler, metadata); assertEquals( @@ -41,7 +39,7 @@ metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - String content = writer.toString(); + String content = handler.toString(); assertTrue(content.contains("Sample Powerpoint Slide")); assertTrue(content.contains("Powerpoint X for Mac")); } finally { Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Apr 10 03:52:06 2008 @@ -17,10 +17,9 @@ package org.apache.tika.parser.microsoft; import java.io.InputStream; -import java.io.StringWriter; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import 
org.xml.sax.ContentHandler; import junit.framework.TestCase; @@ -31,9 +30,8 @@ InputStream input = WordParserTest.class.getResourceAsStream( "/test-documents/testWORD.doc"); try { + ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); new OfficeParser().parse(input, handler, metadata); assertEquals( @@ -41,8 +39,7 @@ metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Sample Word Document", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - String content = writer.toString(); - assertTrue(content.contains("Sample Word Document")); + assertTrue(handler.toString().contains("Sample Word Document")); } finally { input.close(); } Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java Thu Apr 10 03:52:06 2008 @@ -17,12 +17,11 @@ package org.apache.tika.parser.opendocument; import java.io.InputStream; -import java.io.StringWriter; import junit.framework.TestCase; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; public class OpenOfficeParserTest extends TestCase { @@ -32,8 +31,7 @@ "/test-documents/testOpenOffice2.odt"); try { Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new 
WriteOutContentHandler(writer); + ContentHandler handler = new BodyContentHandler(); new OpenOfficeParser().parse(input, handler, metadata); assertEquals( @@ -52,7 +50,7 @@ assertEquals("14", metadata.get("nbWord")); assertEquals("78", metadata.get("nbCharacter")); - String content = writer.toString(); + String content = handler.toString(); assertTrue(content.contains( "This is a sample Open Office document," + " written in NeoOffice 2.2.1 for the Mac.")); Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Thu Apr 10 03:52:06 2008 @@ -21,7 +21,9 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; +import org.xml.sax.ContentHandler; import junit.framework.TestCase; @@ -59,30 +61,24 @@ public void testUTF8Text() throws Exception { String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n"; + ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); parser.parse( new ByteArrayInputStream(text.getBytes("UTF-8")), - new WriteOutContentHandler(writer), - metadata); - String content = writer.toString(); - + handler, metadata); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); - assertTrue(content.contains(text)); + assertTrue(handler.toString().contains(text)); } public void testEmptyText() throws Exception { + ContentHandler handler = 
new BodyContentHandler(); Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); parser.parse( - new ByteArrayInputStream(new byte[0]), - new WriteOutContentHandler(writer), - metadata); - String content = writer.toString(); + new ByteArrayInputStream(new byte[0]), handler, metadata); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("", content); + assertEquals("", handler.toString()); } } Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=646748&r1=646747&r2=646748&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Thu Apr 10 03:52:06 2008 @@ -17,13 +17,13 @@ package org.apache.tika.parser.xml; import java.io.InputStream; -import java.io.StringWriter; import junit.framework.TestCase; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; public class DcXMLParserTest extends TestCase { @@ -32,8 +32,7 @@ "/test-documents/testXML.xml"); try { Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); + ContentHandler handler = new BodyContentHandler(); new DcXMLParser().parse(input, handler, metadata); assertEquals( @@ -55,7 +54,7 @@ assertEquals("Fr", metadata.get(Metadata.LANGUAGE)); assertTrue(metadata.get(Metadata.RIGHTS).contains("testing chars")); - String content = writer.toString(); + String content = handler.toString(); assertTrue(content.contains("Tika test document")); } 
finally { input.close(); @@ -66,9 +65,7 @@ InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml"); try { Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - ContentHandler handler = new WriteOutContentHandler(writer); - new DcXMLParser().parse(input, handler, metadata); + new DcXMLParser().parse(input, new DefaultHandler(), metadata); final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9"; assertEquals(expected,metadata.get(Metadata.RIGHTS));