Tim Starling has submitted this change and it was merged. Change subject: Introduce a custom serializer to resolve some differences with tidy ......................................................................
Introduce a custom serializer to resolve some differences with tidy * Add a terminating slash to void elements * Use &#160; instead of &nbsp; Change-Id: I8a922658dfd1fa56abb4ae2645345c7e15cd8360 --- A src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java M src/main/java/org/wikimedia/html5depurate/Depurator.java 2 files changed, 282 insertions(+), 2 deletions(-) Approvals: Tim Starling: Verified; Looks good to me, approved diff --git a/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java new file mode 100644 index 0000000..9d5fa16 --- /dev/null +++ b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2008-2011 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package org.wikimedia.html5depurate; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.util.Arrays; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.ext.LexicalHandler; + +/* + * It's necessary to copy this whole file from validator.nu's HtmlSerializer + * just to change one minor detail, mostly because private declarations are used + * instead of protected. + * + * The thing that we changed is to add a slash to void elements. This is allowed + * by the HTML 5 spec, it is documented as having no effect. It allows the + * output to pass XHTML validation. + * + * Also &nbsp; is replaced with &#160; + */ +public class DepurateSerializer implements ContentHandler, LexicalHandler { + + private static final String[] VOID_ELEMENTS = { "area", "base", "basefont", + "bgsound", "br", "col", "command", "embed", "frame", "hr", "img", + "input", "keygen", "link", "meta", "param", "source", "track", + "wbr" }; + + private static final String[] NON_ESCAPING = { "iframe", "noembed", + "noframes", "noscript", "plaintext", "script", "style", "xmp" }; + + private static Writer wrap(OutputStream out) { + try { + return new OutputStreamWriter(out, "UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + private int ignoreLevel = 0; + + private int escapeLevel = 0; + + private final Writer writer; + + public DepurateSerializer(OutputStream out) { + this(wrap(out)); + } + + public DepurateSerializer(Writer out) { + this.writer = out; + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + try { + if (escapeLevel > 0) { + writer.write(ch, start, length); + } else { + for (int i = start; i < start + length; i++) { + char c = ch[i]; + switch (c) { + case '<': + writer.write("&lt;"); + break; 
+ case '>': + writer.write("&gt;"); + break; + case '&': + writer.write("&amp;"); + break; + case '\u00A0': + writer.write("&#160;"); + break; + default: + writer.write(c); + break; + } + } + } + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void endDocument() throws SAXException { + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + if (escapeLevel > 0) { + escapeLevel--; + } + if (ignoreLevel > 0) { + ignoreLevel--; + } else { + try { + writer.write('<'); + writer.write('/'); + writer.write(localName); + writer.write('>'); + } catch (IOException e) { + throw new SAXException(e); + } + } + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + characters(ch, start, length); + } + + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startDocument() throws SAXException { + try { + writer.write("<!DOCTYPE html>\n"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException { + if (escapeLevel > 0) { + escapeLevel++; + } + boolean xhtml = "http://www.w3.org/1999/xhtml".equals(uri); + if (ignoreLevel > 0 + || !(xhtml || "http://www.w3.org/2000/svg".equals(uri) || "http://www.w3.org/1998/Math/MathML".equals(uri))) { + ignoreLevel++; + return; + } + try { + writer.write('<'); + writer.write(localName); + for (int i = 0; i < atts.getLength(); i++) { + String attUri = atts.getURI(i); + String attLocal = atts.getLocalName(i); + if (attUri.length() == 0) { + writer.write(' '); + } else if (!xhtml + && "http://www.w3.org/1999/xlink".equals(attUri)) { + writer.write(" xlink:"); + } else if 
("http://www.w3.org/XML/1998/namespace".equals(attUri)) { + if (xhtml) { + if ("lang".equals(attLocal)) { + writer.write(' '); + } else { + continue; + } + } else { + writer.write(" xml:"); + } + } else { + continue; + } + writer.write(atts.getLocalName(i)); + writer.write('='); + writer.write('"'); + String val = atts.getValue(i); + for (int j = 0; j < val.length(); j++) { + char c = val.charAt(j); + switch (c) { + case '"': + writer.write("&quot;"); + break; + case '&': + writer.write("&amp;"); + break; + case '\u00A0': + writer.write("&#160;"); + break; + default: + writer.write(c); + break; + } + } + writer.write('"'); + } + if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) { + writer.write(" />"); + ignoreLevel++; + return; + } else { + writer.write('>'); + } + if ("pre".equals(localName) || "textarea".equals(localName) + || "listing".equals(localName)) { + writer.write('\n'); + } + if (escapeLevel == 0 + && Arrays.binarySearch(NON_ESCAPING, localName) > -1) { + escapeLevel = 1; + } + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void comment(char[] ch, int start, int length) throws SAXException { + if (ignoreLevel > 0 || escapeLevel > 0) { + return; + } + try { + writer.write("<!--"); + writer.write(ch, start, length); + writer.write("-->"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void endCDATA() throws SAXException { + } + + public void endDTD() throws SAXException { + } + + public void endEntity(String name) throws SAXException { + } + + public void startCDATA() throws SAXException { + } + + public void startDTD(String name, String publicId, String systemId) + throws SAXException { + } + + public void startEntity(String name) throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } +} diff --git 
a/src/main/java/org/wikimedia/html5depurate/Depurator.java b/src/main/java/org/wikimedia/html5depurate/Depurator.java index 644d242..46e4545 100644 --- a/src/main/java/org/wikimedia/html5depurate/Depurator.java +++ b/src/main/java/org/wikimedia/html5depurate/Depurator.java @@ -2,7 +2,6 @@ import nu.validator.htmlparser.common.XmlViolationPolicy; import nu.validator.htmlparser.sax.HtmlParser; -import nu.validator.htmlparser.sax.HtmlSerializer; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -16,7 +15,7 @@ throws SAXException, IOException { ByteArrayOutputStream sink = new ByteArrayOutputStream(); - ContentHandler serializer = new HtmlSerializer(sink); + ContentHandler serializer = new DepurateSerializer(sink); HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALLOW); parser.setContentHandler(serializer); parser.setProperty("http://xml.org/sax/properties/lexical-handler", -- To view, visit https://gerrit.wikimedia.org/r/236503 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I8a922658dfd1fa56abb4ae2645345c7e15cd8360 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/services/html5depurate Gerrit-Branch: master Gerrit-Owner: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits