This is an automated email from the ASF dual-hosted git repository. afs pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 0715a32b28fd3828804da95d5d9bbc1663fc8004 Author: Andy Seaborne <[email protected]> AuthorDate: Sun Apr 19 13:09:46 2026 +0100 GH-3868: Output illegal surrogates as \uFFFD --- .../java/org/apache/jena/atlas/lib/EscapeStr.java | 32 +++++++++++++++++++++- .../org/apache/jena/atlas/lib/TestEscapeStr.java | 21 +++++++++++++- .../apache/jena/shex/runner/RunnerPrintShex.java | 2 +- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/jena-base/src/main/java/org/apache/jena/atlas/lib/EscapeStr.java b/jena-base/src/main/java/org/apache/jena/atlas/lib/EscapeStr.java index 298b7a9e70..5c2dde137d 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/lib/EscapeStr.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/lib/EscapeStr.java @@ -135,10 +135,40 @@ public class EscapeStr } // Normal case! - out.print(c); + if ( ! Character.isSurrogate(c) ) { + out.print(c); + continue; + } + + // Surrogate. Check if high-low (legal). + + if ( i < len-1 ) { + // Peek + char c2 = s.charAt(i+1); + if ( Character.isSurrogatePair(c, c2)) { + // Accept c2 + i++; + // Valid surrogate pair! + // Print both surrogates raw, and let the character encoder deal with it. + out.print(c); + out.print(c2); + continue; + } + // The next character could be a low surrogate making this a lone-low surrogate then legal + } + // Bad surrogate. low-high, or end of string. + // c2 is read again. + outputReplacement(out, c); + continue; } } + private static void outputReplacement(AWriter out, char c) { + // This is our policy instead of Java's default of a single '?' (done deep inside UTF_8.Encoder) + //out.printf("\\u%04X", (int)c); + out.print("\\uFFFD"); + } + /** Write a string with Unicode to ASCII conversion using \-u escapes */ public static void writeASCII(AWriter out, String s) { int len = s.length(); diff --git a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestEscapeStr.java b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestEscapeStr.java index 7a7a4b9c4a..bac02e4dd5 100644 --- a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestEscapeStr.java +++ b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestEscapeStr.java @@ -141,6 +141,10 @@ public class TestEscapeStr { assertEquals(expected, output); } + private void test_escape(String input, String expected) { + String output = EscapeStr.stringEsc(input); + assertEquals(expected, output); + } @Test public void unescape_unicode_1() { test_unesc_unicode("", "") ; } @Test public void unescape_unicode_2() { test_unesc_unicode("abc\\u0020def", "abc def") ; } @Test public void unescape_unicode_3() { test_unesc_unicode("\\u0020", " ") ; } @@ -153,7 +157,6 @@ public class TestEscapeStr { @Test public void unescape_unicode_12() { test_unesc_unicode("\\(\\)", "\\(\\)") ; } @Test public void unescape_unicode_13() { test_unesc_unicode("\\\\", "\\\\") ; } - // See also TestEscapeStr // \-u{...} style Unicode escapes @Test public void unescape_unicode_20() { test_unesc_unicode("\\u{41}", "A") ; } @Test public void unescape_unicode_21() { test_unesc_unicode("\\u{000000}", "\u0000") ; } @@ -168,6 +171,22 @@ public class TestEscapeStr { @Test public void unescape_unicode_33() { assertThrows(AtlasException.class, ()->test_unesc_unicode("\\u{1234567}", "")) ; } @Test public void unescape_unicode_34() { assertThrows(AtlasException.class, ()->test_unesc_unicode("\\u{0000000}", "")) ; } + // Escaped surrogates, good and bad. + // Use java character escapes to put the surrogates into the java string. + // 🂡 is U+D83C U+DCA1 + @Test public void escape_unicode_50() { test_escape("\uD83C\uDCA1", "🂡"); } + @Test public void escape_unicode_51() { test_escape("abc\uD83C\uDCA1xyz", "abc🂡xyz"); } + // low, then high -> illegal + @Test public void escape_unicode_55() { test_escape("\uDCA1\uD83C", "\\uFFFD\\uFFFD"); } + @Test public void escape_unicode_56() { test_escape("\uDCA1\uD83C@", "\\uFFFD\\uFFFD@"); } + // Lone surrogate + @Test public void escape_unicode_60() { test_escape("\uD83C", "\\uFFFD"); } + @Test public void escape_unicode_61() { test_escape("abc\uD83Cxyz", "abc\\uFFFDxyz"); } + @Test public void escape_unicode_62() { test_escape("\uDCA1", "\\uFFFD"); } + @Test public void escape_unicode_63() { test_escape("abc\uDCA1xyz", "abc\\uFFFDxyz"); } + + // low, then high/low -> one illegal, encode legal pair. + @Test public void escape_unicode_59() { test_escape("\uDCA1\uD83C\uDCA1", "\\uFFFD🂡"); } private void test_unesc_unicode(String input, String expected) { String output = EscapeStr.unescapeUnicode(input) ; diff --git a/jena-shex/src/test/java/org/apache/jena/shex/runner/RunnerPrintShex.java b/jena-shex/src/test/java/org/apache/jena/shex/runner/RunnerPrintShex.java index b9aa9f7504..7148a348bc 100644 --- a/jena-shex/src/test/java/org/apache/jena/shex/runner/RunnerPrintShex.java +++ b/jena-shex/src/test/java/org/apache/jena/shex/runner/RunnerPrintShex.java @@ -109,7 +109,7 @@ public class RunnerPrintShex extends org.apache.jena.shex.runner.AbstractRunnerF System.out.println("-- --"); Shex.printSchema(System.out, schema2); System.out.println("== =="); - fail("ShEx schames not equivalent"); + fail("ShEx schemas not equivalent"); } }
