This is an automated email from the ASF dual-hosted git repository. paulk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/groovy.git
commit 7039d6a6c9ad2c7f0aba832b57a7317ba867004b Author: Simon Tost <[email protected]> AuthorDate: Sun Aug 30 20:40:21 2020 +0200 Add character filter to MarkupBuilder --- subprojects/groovy-xml/build.gradle | 7 ++ .../src/main/java/groovy/xml/MarkupBuilder.java | 65 ++++++++++- .../xml/MarkupBuilderIllegalCharactersSpec.groovy | 128 +++++++++++++++++++++ 3 files changed, 198 insertions(+), 2 deletions(-) diff --git a/subprojects/groovy-xml/build.gradle b/subprojects/groovy-xml/build.gradle index 2528544..c37fdd3 100644 --- a/subprojects/groovy-xml/build.gradle +++ b/subprojects/groovy-xml/build.gradle @@ -21,6 +21,13 @@ dependencies { testImplementation rootProject.sourceSets.test.runtimeClasspath testImplementation "xmlunit:xmlunit:$xmlunitVersion" testImplementation project(':groovy-test') + testImplementation ("org.spockframework:spock-core:$spockVersion") { + exclude group: 'org.codehaus.groovy' + } +} + +test { + useJUnitPlatform() } task moduleDescriptor(type: org.codehaus.groovy.gradle.WriteExtensionDescriptorTask) { diff --git a/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java b/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java index b7d028e..bdbde82 100644 --- a/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java +++ b/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java @@ -61,6 +61,8 @@ import java.util.Map; * </ul> */ public class MarkupBuilder extends BuilderSupport { + public enum CharFilter { XML_STRICT, XML_ALL, NONE } + private IndentPrinter out; private boolean nospace; private int state; @@ -70,6 +72,7 @@ public class MarkupBuilder extends BuilderSupport { private boolean omitEmptyAttributes = false; private boolean expandEmptyElements = false; private boolean escapeAttributes = true; + private CharFilter characterFilter = CharFilter.NONE; /** * Returns the escapeAttributes property value. 
@@ -222,6 +225,45 @@ public class MarkupBuilder extends BuilderSupport { this.expandEmptyElements = expandEmptyElements; } + /** + * Returns the current character filter. + * + * @return the character filter used by this builder. + */ + public CharFilter getCharacterFilter() { return this.characterFilter; } + + /** + * Set a filter to limit the characters that can appear in attribute values and text nodes. + * <p> + * Some Unicode characters are either not allowed, discouraged or not referenceable with an escape sequence + * by specification. Especially XML parsers might have trouble dealing with some of those characters. + * Since HTML strives for closeness to XML, filtering might be helpful there, too, albeit to a lesser degree. + * </p> + * <p> + * Examples include null bytes (0x0), control characters (0x1C "file separator"), surrogates or non-characters. + * If a filter policy is used, characters that fail to pass will be replaced by 0xFFFD (�) in the output. + * </p> + * <p> + * Available policies are: + * <dl> + * <dt>NONE (Default)</dt> + * <dd>No filter is applied to the output</dd> + * <dt>XML_ALL</dt> + * <dd> + * Allow all characters that are necessarily supported according to the XML spec.<br> + * Given as #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] ( | [#x10000-#x10FFFF] )<br> + * (as of Aug. 2020) + * </dd> + * <dt>XML_STRICT</dt> + * <dd> + * Filter out non-supported <i>and</i> discouraged characters, according to XML spec. 
+ * </dd> + * </dl> + * </p> + * @param characterFilter character policy to use + */ + public void setCharacterFilter(CharFilter characterFilter) { this.characterFilter = characterFilter; } + protected IndentPrinter getPrinter() { return this.out; } @@ -391,21 +433,26 @@ public class MarkupBuilder extends BuilderSupport { private String escapeXmlValue(String value, boolean isAttrValue) { if (value == null) throw new IllegalArgumentException(); - return StringGroovyMethods.collectReplacements(value, new ReplacingClosure(isAttrValue, useDoubleQuotes)); + return StringGroovyMethods.collectReplacements(value, new ReplacingClosure(isAttrValue, useDoubleQuotes, characterFilter)); } private static class ReplacingClosure extends Closure<String> { private final boolean isAttrValue; private final boolean useDoubleQuotes; + private final CharFilter characterFilter; - public ReplacingClosure(boolean isAttrValue, boolean useDoubleQuotes) { + public ReplacingClosure(boolean isAttrValue, boolean useDoubleQuotes, CharFilter characterFilter) { super(null); this.isAttrValue = isAttrValue; this.useDoubleQuotes = useDoubleQuotes; + this.characterFilter = characterFilter; } public String doCall(Character ch) { switch (ch) { + case 0: + if (characterFilter != CharFilter.NONE) return "\uFFFD"; + break; case '&': return "&"; case '<': @@ -435,8 +482,22 @@ public class MarkupBuilder extends BuilderSupport { if (isAttrValue && !useDoubleQuotes) return "'"; break; } + if (characterFilter != CharFilter.NONE) { + if (Character.isSurrogate(ch) + || ch < 127 && ch != 9 && ch != 10 && ch != 12 && ch != 13) { + return "\uFFFD"; + } + } + if (characterFilter == CharFilter.XML_STRICT) { + if (Character.isISOControl(ch) || isNonCharacter(ch)) return "\uFFFD"; + } return null; } + + private boolean isNonCharacter(char ch) { + return 0xFDD0 <= ch && ch <= 0xFDEF + || ((ch % 0x10000 ^ 0xFFFE) == 0 || (ch % 0x10000 ^ 0xFFFF) == 0) && ch >> 16 <= 0x10; + } } private void toState(int next, Object name) { 
diff --git a/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy b/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy new file mode 100644 index 0000000..d978d9c --- /dev/null +++ b/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package groovy.xml + +import spock.lang.Specification + +class MarkupBuilderIllegalCharactersSpec extends Specification { + static final char[] nullCharacter = [0x0] + static final char[] c0Controls = ((0x0..0x8) + [0xB] + (0xE..0x1F)) + static final char[] extControl1 = (0x7F..0x84) + static final char[] nextLine = [0x85] + static final char[] extControl2 = (0x86..0x9F) + static final char[] surrogates = (0xD800..0xDFFF) + static final char[] nonCharacters1 = (0xFDD0..0xFDEF) + static final char[] nonCharacters2 = [0xFFFE, 0xFFFF] + + def "Default MarkupBuilder character filter for #characterRange"(String characterRange, boolean shouldFilter, char[] characters) { + given: + def writer + def xml + def characterFilter = MarkupBuilder.CharFilter.XML_STRICT + + expect: + characters.each { + writer = new StringWriter() + xml = new MarkupBuilder(writer) + xml.characterFilter = characterFilter + def encoded = shouldFilter ? '\uFFFD' : it + + xml.tag(attr: it, it) + def actual = writer.toString() + + assert actual == "<tag attr='$encoded'>$encoded</tag>", + "Character (${it as int}) is encoded correctly" + } + + where: + characterRange | shouldFilter | characters + 'Null' | true | nullCharacter // Not neccessarily XML, not allowed in HTML + 'C0 control w/o whitespace' | true | c0Controls // Not neccessarily XML, not in HTML char references + 'ext control I' | true | extControl1 // Discouraged XML, not in HTML char references + 'Next line NEL' | true | nextLine // Not in HTML char references + 'ext control II' | true | extControl2 // Discouraged XML, not in HTML char references + 'Surrogates' | true | surrogates // Not neccessarily XML, not in HTML char references + 'Non-characters I' | true | nonCharacters1 // Discouraged XML, not in HTML char references + 'Non-characters II' | true | nonCharacters2 // Discouraged XML, not in HTML char references + } + + def "MarkupBuilder ALL_XML character filter for #characterRange"(String characterRange, boolean shouldFilter, 
char[] characters) { + given: + def writer + def xml + def characterFilter = MarkupBuilder.CharFilter.XML_ALL + + expect: + characters.each { + writer = new StringWriter() + xml = new MarkupBuilder(writer) + xml.characterFilter = characterFilter + def encoded = shouldFilter ? '\uFFFD' : it + + xml.tag(attr: it, it) + def actual = writer.toString() + + assert actual == "<tag attr='$encoded'>$encoded</tag>", + "Character (${it as int}) is encoded correctly" + } + + where: + characterRange | shouldFilter | characters + 'Null' | true | nullCharacter // Not neccessarily XML, not allowed in HTML + 'C0 control w/o whitespace' | true | c0Controls // Not neccessarily XML, not in HTML char references + 'ext control I' | false | extControl1 // Discouraged XML, not in HTML char references + 'Next line NEL' | false | nextLine // Not in HTML char references + 'ext control II' | false | extControl2 // Discouraged XML, not in HTML char references + 'Surrogates' | true | surrogates // Not neccessarily XML, not in HTML char references + 'Non-characters I' | false | nonCharacters1 // Discouraged XML, not in HTML char references + 'Non-characters II' | false | nonCharacters2 // Discouraged XML, not in HTML char references + } + + def "MarkupBuilder NONE character filter for #characterRange"(String characterRange, boolean shouldFilter, char[] characters) { + given: + def writer + def xml + def characterFilter = MarkupBuilder.CharFilter.NONE + + expect: + characters.each { + writer = new StringWriter() + xml = new MarkupBuilder(writer) + xml.characterFilter = characterFilter + def encoded = shouldFilter ? 
'\uFFFD' : it + + xml.tag(attr: it, it) + def actual = writer.toString() + + assert actual == "<tag attr='$encoded'>$encoded</tag>", + "Character (${it as int}) is encoded correctly" + } + + where: + characterRange | shouldFilter | characters + 'Null' | false | nullCharacter // Not neccessarily XML, not allowed in HTML + 'C0 control w/o whitespace' | false | c0Controls // Not neccessarily XML, not in HTML char references + 'ext control I' | false | extControl1 // Discouraged XML, not in HTML char references + 'Next line NEL' | false | nextLine // Not in HTML char references + 'ext control II' | false | extControl2 // Discouraged XML, not in HTML char references + 'Surrogates' | false | surrogates // Not neccessarily XML, not in HTML char references + 'Non-characters I' | false | nonCharacters1 // Discouraged XML, not in HTML char references + 'Non-characters II' | false | nonCharacters2 // Discouraged XML, not in HTML char references + } +}
