Author: tpalsulich
Date: Tue Mar 3 02:18:53 2015
New Revision: 1663513
URL: http://svn.apache.org/r1663513
Log:
TIKA-995. Properly output XHTML body attributes, contributed by Markus Jelsma.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Mar 3 02:18:53 2015
@@ -60,7 +60,7 @@ public class XHTMLContentHandler extends
* skip them if they get sent to startElement/endElement by mistake.
*/
private static final Set<String> AUTO =
- unmodifiableSet("html", "head", "body", "frameset");
+ unmodifiableSet("html", "head", "frameset");
/**
* The elements that get prepended with the {@link #TAB} character.
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Tue Mar 3 02:18:53 2015
@@ -17,10 +17,12 @@
package org.apache.tika.sax;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.List;
+import org.apache.tika.config.TikaConfigTest;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
@@ -28,6 +30,7 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
* Unit tests for the {@link XHTMLContentHandler} class.
@@ -121,6 +124,24 @@ public class XHTMLContentHandlerTest {
assertEquals("two", words[1]);
}
+ @Test
+ public void testAttributesOnBody() throws Exception {
+ ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
+ XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
+ AttributesImpl attributes = new AttributesImpl();
+
+ attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
+ attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
+
+ xhtmlContentHandler.startDocument();
+ xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes);
+ xhtmlContentHandler.endElement("body");
+ xhtmlContentHandler.endDocument();
+
+ System.err.println("Content: " + toHTMLContentHandler.toString());
+ assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
+ }
+
/**
* Return array of non-zerolength words. Splitting on whitespace will get us
* empty words for emptylines.