[groovy] 01/04: Add character filter to MarkupBuilder

paulk Thu, 17 Sep 2020 06:29:42 -0700

This is an automated email from the ASF dual-hosted git repository.

paulk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/groovy.git


commit 7039d6a6c9ad2c7f0aba832b57a7317ba867004b
Author: Simon Tost <[email protected]>
AuthorDate: Sun Aug 30 20:40:21 2020 +0200

    Add character filter to MarkupBuilder
---
 subprojects/groovy-xml/build.gradle                |   7 ++
 .../src/main/java/groovy/xml/MarkupBuilder.java    |  65 ++++++++++-
 .../xml/MarkupBuilderIllegalCharactersSpec.groovy  | 128 +++++++++++++++++++++
 3 files changed, 198 insertions(+), 2 deletions(-)

diff --git a/subprojects/groovy-xml/build.gradle 
b/subprojects/groovy-xml/build.gradle
index 2528544..c37fdd3 100644
--- a/subprojects/groovy-xml/build.gradle
+++ b/subprojects/groovy-xml/build.gradle
@@ -21,6 +21,13 @@ dependencies {
     testImplementation rootProject.sourceSets.test.runtimeClasspath
     testImplementation "xmlunit:xmlunit:$xmlunitVersion"
     testImplementation project(':groovy-test')
+    testImplementation ("org.spockframework:spock-core:$spockVersion") {
+        exclude group: 'org.codehaus.groovy'
+    }
+}
+
+test {
+    useJUnitPlatform()
 }
 
 task moduleDescriptor(type: 
org.codehaus.groovy.gradle.WriteExtensionDescriptorTask) {
diff --git a/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java 
b/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java
index b7d028e..bdbde82 100644
--- a/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java
+++ b/subprojects/groovy-xml/src/main/java/groovy/xml/MarkupBuilder.java
@@ -61,6 +61,8 @@ import java.util.Map;
  * </ul>
  */
 public class MarkupBuilder extends BuilderSupport {
+    public enum CharFilter { XML_STRICT, XML_ALL, NONE }
+
     private IndentPrinter out;
     private boolean nospace;
     private int state;
@@ -70,6 +72,7 @@ public class MarkupBuilder extends BuilderSupport {
     private boolean omitEmptyAttributes = false;
     private boolean expandEmptyElements = false;
     private boolean escapeAttributes = true;
+    private CharFilter characterFilter = CharFilter.NONE;
 
     /**
      * Returns the escapeAttributes property value.
@@ -222,6 +225,45 @@ public class MarkupBuilder extends BuilderSupport {
         this.expandEmptyElements = expandEmptyElements;
     }
 
+    /**
+     * Returns the current character filter.
+     *
+     * @return the character filter used by this builder.
+     */
+    public CharFilter getCharacterFilter() { return this.characterFilter; }
+
+    /**
+     * Sets a filter that limits the characters that can appear in attribute values and text nodes.
+     * <p>
+     *     Some Unicode characters are either not allowed, discouraged or not referenceable with an
+     *     escape sequence by specification. XML parsers especially might have trouble dealing with
+     *     some of those characters. Since HTML strives for closeness to XML, filtering might be
+     *     helpful there, too, albeit to a lesser degree.
+     * </p>
+     * <p>
+     *     Examples include null bytes (0x0), control characters (0x1C "file separator"), surrogates
+     *     or non-characters. If a filter policy is used, characters that fail to pass will be
+     *     replaced by 0xFFFD (&#xFFFD;) in the output.
+     * </p>
+     * <p>
+     *     Available policies are:
+     *     <dl>
+     *         <dt>NONE (Default)</dt>
+     *         <dd>No filter is applied to the output</dd>
+     *         <dt>XML_ALL</dt>
+     *         <dd>
+     *             Allow all characters that are necessarily supported according to the XML spec.<br>
+     *             Given as #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] ( | [#x10000-#x10FFFF] )<br>
+     *             (as of Aug. 2020)
+     *         </dd>
+     *         <dt>XML_STRICT</dt>
+     *         <dd>
+     *             Filter out unsupported <i>and</i> discouraged characters, according to the XML spec.
+     *         </dd>
+     *     </dl>
+     * </p>
+     * @param characterFilter character policy to use
+     */
+    public void setCharacterFilter(CharFilter characterFilter) { 
this.characterFilter = characterFilter; }
+
     protected IndentPrinter getPrinter() {
         return this.out;
     }
@@ -391,21 +433,26 @@ public class MarkupBuilder extends BuilderSupport {
     private String escapeXmlValue(String value, boolean isAttrValue) {
         if (value == null)
             throw new IllegalArgumentException();
-        return StringGroovyMethods.collectReplacements(value, new 
ReplacingClosure(isAttrValue, useDoubleQuotes));
+        return StringGroovyMethods.collectReplacements(value, new 
ReplacingClosure(isAttrValue, useDoubleQuotes, characterFilter));
     }
 
     private static class ReplacingClosure extends Closure<String> {
         private final boolean isAttrValue;
         private final boolean useDoubleQuotes;
+        private final CharFilter characterFilter;
 
-        public ReplacingClosure(boolean isAttrValue, boolean useDoubleQuotes) {
+        public ReplacingClosure(boolean isAttrValue, boolean useDoubleQuotes, 
CharFilter characterFilter) {
             super(null);
             this.isAttrValue = isAttrValue;
             this.useDoubleQuotes = useDoubleQuotes;
+            this.characterFilter = characterFilter;
         }
 
         public String doCall(Character ch) {
             switch (ch) {
+                case 0:
+                    if (characterFilter != CharFilter.NONE) return "\uFFFD";
+                    break;
                 case '&':
                     return "&amp;";
                 case '<':
@@ -435,8 +482,22 @@ public class MarkupBuilder extends BuilderSupport {
                     if (isAttrValue && !useDoubleQuotes) return "&apos;";
                     break;
             }
+            if (characterFilter != CharFilter.NONE) {
+                if (Character.isSurrogate(ch)
+                        || ch < 127 && ch !=  9 && ch != 10 && ch != 12 && ch 
!= 13) {
+                    return "\uFFFD";
+                }
+            }
+            if (characterFilter == CharFilter.XML_STRICT) {
+                if (Character.isISOControl(ch) || isNonCharacter(ch))  return 
"\uFFFD";
+            }
             return null;
         }
+
+        /**
+         * Tells whether the given UTF-16 code unit is a Unicode non-character.
+         *
+         * @param ch the code unit to test
+         * @return {@code true} for U+FDD0..U+FDEF, U+FFFE and U+FFFF
+         */
+        private boolean isNonCharacter(char ch) {
+            if (ch >= 0xFDD0 && ch <= 0xFDEF) {
+                return true;
+            }
+            // A char holds 16 bits only, so of the per-plane xFFFE/xFFFF pairs just the
+            // two in the basic plane can ever be seen here.
+            return ch == 0xFFFE || ch == 0xFFFF;
+        }
     }
 
     private void toState(int next, Object name) {
diff --git 
a/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy
 
b/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy
new file mode 100644
index 0000000..d978d9c
--- /dev/null
+++ 
b/subprojects/groovy-xml/src/test/groovy/groovy/xml/MarkupBuilderIllegalCharactersSpec.groovy
@@ -0,0 +1,128 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+package groovy.xml
+
+import spock.lang.Specification
+
+/**
+ * Checks, for each CharFilter policy, which illegal or discouraged characters
+ * MarkupBuilder replaces with U+FFFD in attribute values and text nodes.
+ */
+class MarkupBuilderIllegalCharactersSpec extends Specification {
+    static final char[] nullCharacter  = [0x0]
+    static final char[] c0Controls     = ((0x0..0x8) + [0xB] + (0xE..0x1F))
+    static final char[] extControl1    = (0x7F..0x84)
+    static final char[] nextLine       = [0x85]
+    static final char[] extControl2    = (0x86..0x9F)
+    static final char[] surrogates     = (0xD800..0xDFFF)
+    static final char[] nonCharacters1 = (0xFDD0..0xFDEF)
+    static final char[] nonCharacters2 = [0xFFFE, 0xFFFF]
+
+    // Named after the filter actually under test: the builder's default filter is NONE, not XML_STRICT.
+    def "MarkupBuilder XML_STRICT character filter for #characterRange"(String characterRange, boolean shouldFilter, char[] characters) {
+        given:
+        def writer
+        def xml
+        def characterFilter = MarkupBuilder.CharFilter.XML_STRICT
+
+        expect:
+        characters.each {
+            writer = new StringWriter()
+            xml = new MarkupBuilder(writer)
+            xml.characterFilter = characterFilter
+            def encoded = shouldFilter ? '\uFFFD' : it
+
+            xml.tag(attr: it, it)
+            def actual = writer.toString()
+
+            assert actual == "<tag attr='$encoded'>$encoded</tag>",
+                    "Character (${it as int}) is encoded correctly"
+        }
+
+        where:
+        characterRange              | shouldFilter | characters
+        'Null'                      | true         | nullCharacter         // Not necessarily XML, not allowed in HTML
+        'C0 control w/o whitespace' | true         | c0Controls            // Not necessarily XML, not in HTML char references
+        'ext control I'             | true         | extControl1           // Discouraged XML, not in HTML char references
+        'Next line NEL'             | true         | nextLine              // Not in HTML char references
+        'ext control II'            | true         | extControl2           // Discouraged XML, not in HTML char references
+        'Surrogates'                | true         | surrogates            // Not necessarily XML, not in HTML char references
+        'Non-characters I'          | true         | nonCharacters1        // Discouraged XML, not in HTML char references
+        'Non-characters II'         | true         | nonCharacters2        // Discouraged XML, not in HTML char references
+    }
+
+    def "MarkupBuilder XML_ALL character filter for #characterRange"(String characterRange, boolean shouldFilter, char[] characters) {
+        given:
+        def writer
+        def xml
+        def characterFilter = MarkupBuilder.CharFilter.XML_ALL
+
+        expect:
+        characters.each {
+            writer = new StringWriter()
+            xml = new MarkupBuilder(writer)
+            xml.characterFilter = characterFilter
+            def encoded = shouldFilter ? '\uFFFD' : it
+
+            xml.tag(attr: it, it)
+            def actual = writer.toString()
+
+            assert actual == "<tag attr='$encoded'>$encoded</tag>",
+                    "Character (${it as int}) is encoded correctly"
+        }
+
+        where:
+        characterRange              | shouldFilter | characters
+        'Null'                      | true         | nullCharacter         // Not necessarily XML, not allowed in HTML
+        'C0 control w/o whitespace' | true         | c0Controls            // Not necessarily XML, not in HTML char references
+        'ext control I'             | false        | extControl1           // Discouraged XML, not in HTML char references
+        'Next line NEL'             | false        | nextLine              // Not in HTML char references
+        'ext control II'            | false        | extControl2           // Discouraged XML, not in HTML char references
+        'Surrogates'                | true         | surrogates            // Not necessarily XML, not in HTML char references
+        'Non-characters I'          | false        | nonCharacters1        // Discouraged XML, not in HTML char references
+        'Non-characters II'         | false        | nonCharacters2        // Discouraged XML, not in HTML char references
+    }
+
+    def "MarkupBuilder NONE character filter for #characterRange"(String characterRange, boolean shouldFilter, char[] characters) {
+        given:
+        def writer
+        def xml
+        def characterFilter = MarkupBuilder.CharFilter.NONE
+
+        expect:
+        characters.each {
+            writer = new StringWriter()
+            xml = new MarkupBuilder(writer)
+            xml.characterFilter = characterFilter
+            def encoded = shouldFilter ? '\uFFFD' : it
+
+            xml.tag(attr: it, it)
+            def actual = writer.toString()
+
+            assert actual == "<tag attr='$encoded'>$encoded</tag>",
+                    "Character (${it as int}) is encoded correctly"
+        }
+
+        where:
+        characterRange              | shouldFilter | characters
+        'Null'                      | false        | nullCharacter         // Not necessarily XML, not allowed in HTML
+        'C0 control w/o whitespace' | false        | c0Controls            // Not necessarily XML, not in HTML char references
+        'ext control I'             | false        | extControl1           // Discouraged XML, not in HTML char references
+        'Next line NEL'             | false        | nextLine              // Not in HTML char references
+        'ext control II'            | false        | extControl2           // Discouraged XML, not in HTML char references
+        'Surrogates'                | false        | surrogates            // Not necessarily XML, not in HTML char references
+        'Non-characters I'          | false        | nonCharacters1        // Discouraged XML, not in HTML char references
+        'Non-characters II'         | false        | nonCharacters2        // Discouraged XML, not in HTML char references
+    }
+}

[groovy] 01/04: Add character filter to MarkupBuilder

Reply via email to