Author: lryan
Date: Fri Nov 21 14:33:15 2008
New Revision: 719736

URL: http://svn.apache.org/viewvc?rev=719736&view=rev
Log:
Do not inject doctypes if none existed in the original content.

Added:
    
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html
Modified:
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/GadgetHtmlParser.java
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoParsersTest.java
    
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-expected.html
    
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fragment-expected.html
    
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-headnobody-expected.html
    
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test.html

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/GadgetHtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/GadgetHtmlParser.java?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/GadgetHtmlParser.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/GadgetHtmlParser.java
 Fri Nov 21 14:33:15 2008
@@ -51,7 +51,7 @@
    * @return true if we detect a preamble of doctype or html
    */
   protected static boolean attemptFullDocParseFirst(String content) {
-    String normalized = content.substring(Math.min(100, content.length())).toUpperCase();
+    String normalized = content.substring(0, Math.min(100, content.length())).toUpperCase();
     return normalized.contains("<!DOCTYPE") || normalized.contains("<HTML");
   }
 

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
 Fri Nov 21 14:33:15 2008
@@ -17,7 +17,6 @@
  */
 package org.apache.shindig.gadgets.parse.nekohtml;
 
-import org.apache.shindig.common.xml.DomUtil;
 import org.apache.shindig.gadgets.GadgetException;
 import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
 import org.apache.shindig.gadgets.parse.HtmlSerializer;
@@ -28,6 +27,7 @@
 import org.apache.xml.serialize.HTMLSerializer;
 import org.apache.xml.serialize.OutputFormat;
 import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.cyberneko.html.parsers.DOMParser;
 import org.w3c.dom.DOMImplementation;
 import org.w3c.dom.Document;
 import org.w3c.dom.DocumentFragment;
@@ -69,26 +69,34 @@
 
   private Document parseFragment(String source) throws SAXException, IOException {
     InputSource input = new InputSource(new StringReader(source));
-    DOMFragmentParser parser = new DOMFragmentParser();
-    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
-    parser.setFeature("http://cyberneko.org/html/features/document-fragment", true);
-
-    Document htmlDoc = documentProvider.createDocument(null, null, null);
-    DocumentFragment fragment = htmlDoc.createDocumentFragment();
-    parser.parse(input, fragment);
-    normalizeFragment(htmlDoc, fragment);
-    return htmlDoc;
+    if (attemptFullDocParseFirst(source)) {
+      DOMParser parser = new DOMParser();
+      // Force parser not to use HTMLDocumentImpl as document implementation
+      parser.setProperty("http://apache.org/xml/properties/dom/document-class-name", null);
+      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
+      parser.parse(input);
+      return parser.getDocument();
+    } else {
+      Document htmlDoc = documentProvider.createDocument(null, null, null);
+      DOMFragmentParser parser = new DOMFragmentParser();
+      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
+      parser.setFeature("http://cyberneko.org/html/features/document-fragment", true);
+      DocumentFragment fragment = htmlDoc.createDocumentFragment();
+      parser.parse(input, fragment);
+      normalizeFragment(htmlDoc, fragment);
+      return htmlDoc;
+    }
   }
 
   static class Serializer extends HtmlSerializer {
 
-    static final OutputFormat outputFormat = new OutputFormat();
-    static {
+    public String serializeImpl(Document doc) {
+      OutputFormat outputFormat = new OutputFormat();
       outputFormat.setPreserveSpace(true);
       outputFormat.setPreserveEmptyAttributes(false);
-    }
-
-    public String serializeImpl(Document doc) {
+      if (doc.getDoctype() == null) {
+        outputFormat.setOmitDocumentType(true);
+      }
       StringWriter sw = createWriter(doc);
       HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat);
       try {

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
 Fri Nov 21 14:33:15 2008
@@ -320,14 +320,14 @@
 
   static class Serializer extends HtmlSerializer {
 
-    static final OutputFormat outputFormat = new OutputFormat();
-    static {
-      outputFormat.setPreserveSpace(true);
-      outputFormat.setPreserveEmptyAttributes(false);
-    }
-
     @Override
     public String serializeImpl(Document doc) {
+      OutputFormat outputFormat = new OutputFormat();
+      outputFormat.setPreserveSpace(true);
+      outputFormat.setPreserveEmptyAttributes(false);
+      if (doc.getDoctype() == null) {
+        outputFormat.setOmitDocumentType(true);
+      }
       StringWriter sw = createWriter(doc);
       HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat) {
         // Overridden to prevent escaping of literal text

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoParsersTest.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoParsersTest.java?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoParsersTest.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoParsersTest.java
 Fri Nov 21 14:33:15 2008
@@ -36,7 +36,8 @@
   private NekoHtmlParser full = new NekoHtmlParser(
         new ParseModule.DOMImplementationProvider().get());
 
-  public void testParser() throws Exception {
+  public void testDocWithDoctype() throws Exception {
+    // Note that doctype is properly retained
     String content = IOUtils.toString(this.getClass().getClassLoader().
         
getResourceAsStream("org/apache/shindig/gadgets/parse/nekohtml/test.html"));
     String expected = IOUtils.toString(this.getClass().getClassLoader().
@@ -45,7 +46,16 @@
     parseAndCompareBalanced(content, expected, simple);
   }
 
+  public void testDocNoDoctype() throws Exception {
+    // Note that no doctype is properly created when none specified
+    String content = IOUtils.toString(this.getClass().getClassLoader().
+        
getResourceAsStream("org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html"));
+    assertNull(full.parseDom(content).getDoctype());
+    assertNull(simple.parseDom(content).getDoctype());
+  }
+
   public void testNotADocument() throws Exception {
+    // Note that no doctype is injected for fragments
     String content = IOUtils.toString(this.getClass().getClassLoader().
         
getResourceAsStream("org/apache/shindig/gadgets/parse/nekohtml/test-fragment.html"));
     String expected = IOUtils.toString(this.getClass().getClassLoader().
@@ -55,6 +65,7 @@
   }
 
   public void testNoBody() throws Exception {
+    // Note that no doctype is injected for fragments
     String content = IOUtils.toString(this.getClass().getClassLoader().
         
getResourceAsStream("org/apache/shindig/gadgets/parse/nekohtml/test-headnobody.html"));
     String expected = IOUtils.toString(this.getClass().getClassLoader().

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-expected.html
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-expected.html?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-expected.html
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-expected.html
 Fri Nov 21 14:33:15 2008
@@ -1,4 +1,4 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd">
 <html>
 <head id="head">
   <link href="http://www.example.org/css.css" rel="stylesheet" type="text/css">

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fragment-expected.html
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fragment-expected.html?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fragment-expected.html
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fragment-expected.html
 Fri Nov 21 14:33:15 2008
@@ -1,3 +1,2 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <html><head></head><body><script>document.write("dont add to head or 
else")</script>
 <style type="text/css"> can go in head</style></body></html>
\ No newline at end of file

Added: 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html?rev=719736&view=auto
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html
 (added)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-fulldocnodoctype.html
 Fri Nov 21 14:33:15 2008
@@ -0,0 +1,6 @@
+<html>
+  <head></head>
+  <body>
+  <!-- This is a full doc with no doctype -->
+  </body>
+</html>
\ No newline at end of file

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-headnobody-expected.html
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-headnobody-expected.html?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-headnobody-expected.html
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test-headnobody-expected.html
 Fri Nov 21 14:33:15 2008
@@ -1,4 +1,3 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <html><head>
     <!-- A head tag but no body tag is not good -->
 </head><body>

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test.html
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test.html?rev=719736&r1=719735&r2=719736&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test.html
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/resources/org/apache/shindig/gadgets/parse/nekohtml/test.html
 Fri Nov 21 14:33:15 2008
@@ -1,4 +1,4 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd">
 <html>
 <head id="head">
   <link href="http://www.example.org/css.css" rel="stylesheet" type="text/css">


Reply via email to