Author: jukka Date: Wed Mar 26 23:32:00 2008 New Revision: 641712 URL: http://svn.apache.org/viewvc?rev=641712&view=rev Log: TIKA-97: Tika GUI - Simplify the HTML output for JEditorPane to better understand it
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=641712&r1=641711&r2=641712&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Wed Mar 26 23:32:00 2008 @@ -43,12 +43,15 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Simple Swing GUI for Apache Tika. You can drag and drop files on top @@ -196,6 +199,24 @@ editor.setCaretPosition(0); } + /** + * Creates and returns a content handler that turns XHTML input to + * simplified HTML output that can be correctly parsed and displayed + * by {@link JEditorPane}. + * <p> + * The returned content handler is set to output <code>html</code> + * to the given writer. The XHTML namespace is removed from the output + * to prevent the serializer from using the &lt;tag/&gt; empty element + * syntax that causes extra "&gt;" characters to be displayed. + * The &lt;head&gt; tags are dropped to prevent the serializer from + * generating a &lt;META&gt; content type tag that makes + * {@link JEditorPane} fail thinking that the document character set + * is inconsistent. 
+ * + * @param writer output writer + * @return HTML content handler + * @throws TransformerConfigurationException if an error occurs + */ private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException { SAXTransformerFactory factory = (SAXTransformerFactory) @@ -203,7 +224,35 @@ TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.setResult(new StreamResult(writer)); - return handler; + return new ContentHandlerDecorator(handler) { + @Override + public void startElement( + String uri, String localName, String name, Attributes atts) + throws SAXException { + if (XHTMLContentHandler.XHTML.equals(uri)) { + uri = null; + } + if (!"head".equals(localName)) { + super.startElement(uri, localName, name, atts); + } + } + @Override + public void endElement(String uri, String localName, String name) + throws SAXException { + if (XHTMLContentHandler.XHTML.equals(uri)) { + uri = null; + } + if (!"head".equals(localName)) { + super.endElement(uri, localName, name); + } + } + @Override + public void startPrefixMapping(String prefix, String uri) { + } + @Override + public void endPrefixMapping(String prefix) { + } + }; } private ContentHandler getTextContentHandler(Writer writer) {