(jena) 06/07: GH-3281: Update TokenizerText to check RDF Strings

andy Mon, 30 Jun 2025 04:01:54 -0700

This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git


commit 530d0a727cf05641372f55efbe9b5ce2c7d8ce79
Author: Andy Seaborne <[email protected]>
AuthorDate: Thu Jun 26 19:07:50 2025 +0100

    GH-3281: Update TokenizerText to check RDF Strings
---
 .../org/apache/jena/riot/tokens/TokenizerText.java | 290 +++++++++++----------
 .../jena/riot/tokens/TokenizerTextBuilder.java     |   2 +-
 .../org/apache/jena/riot/lang/TestLangTurtle.java  | 167 ++++++++----
 .../apache/jena/riot/tokens/TestTokenizerText.java | 129 +++++++++
 .../java/org/apache/jena/atlas/io/CharStream.java  |   1 -
 .../java/org/apache/jena/atlas/io/PeekReader.java  |   7 +-
 .../jena/tdb1/store/nodetable/TestNodec.java       |   6 +-
 7 files changed, 407 insertions(+), 195 deletions(-)

diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index 9e315870c6..52547b2d50 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -43,10 +43,13 @@ public final class TokenizerText implements Tokenizer
     // do the check later in the parsing process. In case a need arises, the 
code
     // remains, all compiled away by "if ( false )" (javac does not generate 
any
     // bytecodes and even if it it did, JIT will remove dead branches).
-    private static final boolean Checking = false;
+    private static final boolean CHECKING = false;
     // Optional checker.
     private final TokenChecker checker = null;
 
+    // Whether to check for legal RDF strings (no ill formed use of surrogates)
+    private static final boolean CHECK_RDFSTRING = true;
+
     // Workspace for building token images.
     // Reusing a StringBuilder is faster than allocating a fresh one each time.
     private final StringBuilder stringBuilder = new StringBuilder(200);
@@ -55,8 +58,6 @@ public final class TokenizerText implements Tokenizer
     private final PeekReader reader;
     // Whether whitespace between tokens includes newlines (in various forms).
     private final boolean singleLineMode;
-    // Indicator. The PeekReader should throw 
java.nio.charset.MalformedInputException
-    private final boolean isASCII;
     // The code assumes that errors throw exception and so stop parsing.
     private final ErrorHandler errorHandler;
 
@@ -67,15 +68,14 @@ public final class TokenizerText implements Tokenizer
 
     public static Tokenizer fromString(String string) { return 
create().fromString(string).build(); }
 
-    /*package*/ static TokenizerText internal(PeekReader reader, boolean 
singleLineMode, boolean isASCII, ErrorHandler errorHandler) {
-        return new TokenizerText(reader, singleLineMode, isASCII, 
errorHandler);
+    /*package*/ static TokenizerText internal(PeekReader reader, boolean 
singleLineMode, ErrorHandler errorHandler) {
+        return new TokenizerText(reader, singleLineMode, errorHandler);
     }
 
-    private TokenizerText(PeekReader reader, boolean singleLineMode, boolean 
isASCII, ErrorHandler errorHandler) {
+    private TokenizerText(PeekReader reader, boolean singleLineMode, 
ErrorHandler errorHandler) {
         this.reader = Objects.requireNonNull(reader, "PeekReader");
         this.singleLineMode = singleLineMode;
         this.errorHandler = Objects.requireNonNull(errorHandler, 
"ErrorHandler");
-        this.isASCII = isASCII;
     }
 
     @Override
@@ -183,7 +183,7 @@ public final class TokenizerText implements Tokenizer
                 // '<' not '<<'
                 token.setImage(readIRI());
                 token.setType(TokenType.IRI);
-                if ( Checking )
+                if ( CHECKING )
                     checkURI(token.getImage());
                 return token;
             }
@@ -215,7 +215,7 @@ public final class TokenizerText implements Tokenizer
                 int ch3 = reader.peekChar();
                 if ( ch3 == ch ) {
                     reader.readChar();     // Read potential third quote.
-                    token.setImage(readStringQuote3(ch, false));
+                    token.setImage(readStringQuote3(ch));
                     StringType st = (ch == CH_QUOTE1) ? 
StringType.LONG_STRING1 : StringType.LONG_STRING2;
                     token.setStringType(st);
                 } else {
@@ -224,8 +224,7 @@ public final class TokenizerText implements Tokenizer
                     // No need to pushback characters as we know the lexical
                     // form is the empty string.
                     // if ( ch2 != EOF ) reader.pushbackChar(ch2);
-                    // if ( ch1 != EOF ) reader.pushbackChar(ch1); // Must be
-                    // '' or ""
+                    // if ( ch1 != EOF ) reader.pushbackChar(ch1);
                     token.setImage("");
                     StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : 
StringType.STRING2;
                     token.setStringType(st);
@@ -251,7 +250,7 @@ public final class TokenizerText implements Tokenizer
                 mainToken.setSubToken1(token);
                 mainToken.setImage2(langTag());
                 token = mainToken;
-                if ( Checking )
+                if ( CHECKING )
                     checkLiteralLang(token.getImage(), token.getImage2());
             } else if ( reader.peekChar() == '^' ) {
                 expect("^^");
@@ -276,11 +275,11 @@ public final class TokenizerText implements Tokenizer
                 mainToken.setType(TokenType.LITERAL_DT);
 
                 token = mainToken;
-                if ( Checking )
+                if ( CHECKING )
                     checkLiteralDT(token.getImage(), subToken);
             } else {
                 // Was a simple string.
-                if ( Checking )
+                if ( CHECKING )
                     checkString(token.getImage());
             }
             return token;
@@ -293,7 +292,7 @@ public final class TokenizerText implements Tokenizer
                 reader.readChar();
                 token.setImage(readBlankNodeLabel());
                 token.setType(TokenType.BNODE);
-                if ( Checking ) checkBlankNode(token.getImage());
+                if ( CHECKING ) checkBlankNode(token.getImage());
                 return token;
             }
             token.setType(TokenType.UNDERSCORE);
@@ -306,7 +305,7 @@ public final class TokenizerText implements Tokenizer
             reader.readChar();
             token.setType(TokenType.DIRECTIVE);
             token.setImage(readWord(false));
-            if ( Checking )
+            if ( CHECKING )
                 checkDirective(token.getImage());
             return token;
         }
@@ -317,7 +316,7 @@ public final class TokenizerText implements Tokenizer
             token.setType(TokenType.VAR);
             // Character set?
             token.setImage(readVarName());
-            if ( Checking )
+            if ( CHECKING )
                 checkVariable(token.getImage());
             return token;
         }
@@ -336,7 +335,7 @@ public final class TokenizerText implements Tokenizer
                     reader.pushbackChar(CH_DOT);
                     boolean charactersConsumed = readNumber(CH_ZERO, false);
                     if ( charactersConsumed ) {
-                        if ( Checking )
+                        if ( CHECKING )
                             checkNumber(token.getImage(), token.getImage2());
                         return token;
                     }
@@ -517,7 +516,7 @@ public final class TokenizerText implements Tokenizer
 
         readPrefixedNameOrKeyword(token);
 
-        if ( Checking ) checkKeyword(token.getImage());
+        if ( CHECKING ) checkKeyword(token.getImage());
         return token;
     }
 
@@ -539,17 +538,14 @@ public final class TokenizerText implements Tokenizer
                     fatal("Broken IRI (CR): %s", stringBuilder.toString()); 
return null;
                 case CH_GT:
                     // Done!
-                    return stringBuilder.toString();
+                    String str = stringBuilder.toString();
+                    if ( CHECK_RDFSTRING )
+                        checkRDFString(str);
+                    return str;
                 case CH_RSLASH:
-                    if ( VeryVeryLaxIRI )
-                        // Includes unicode escapes and also \n etc
-                        ch = readLiteralEscape();
-                    else
-                        // NORMAL
-                        ch = readUnicodeEscape();
+                    ch = readUnicodeEscape();
                     // Don't check legality of ch (strict syntax at this 
point).
-                    // That does not mean it is a good idea to bypass checking.
-                    // Bad characters will lead to trouble elsewhere.
+                    // IRI parsing will catch errors.
                     break;
                 case CH_LT:
                     // Probably a corrupt file so treat as fatal.
@@ -558,7 +554,7 @@ public final class TokenizerText implements Tokenizer
                     error("Bad character in IRI (tab character): 
<%s[tab]...>", stringBuilder.toString()); break;
                 case '{': case '}': case '"': case '|': case '^': case '`' :
                     if ( ! VeryVeryLaxIRI )
-                        warning("Illegal character in IRI (codepoint 0x%02X, 
'%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
+                        warning("Illegal character in IRI (codepoint U+%04X, 
'%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
                     break;
                 case SPC:
                     if ( ! AllowSpacesInIRI )
@@ -578,21 +574,6 @@ public final class TokenizerText implements Tokenizer
         }
     }
 
-    // Read a unicode escape : does not allow \\ bypass
-    private final int readUnicodeEscape() {
-        int ch = reader.readChar();
-        if ( ch == EOF )
-            fatal("Broken escape sequence");
-
-        switch (ch) {
-            case 'u': return readUnicode4Escape();
-            case 'U': return readUnicode8Escape();
-            default:
-                fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", 
ch, ch);
-        }
-        return 0;
-    }
-
     private void readPrefixedNameOrKeyword(Token token) {
         long posn = reader.getPosition();
         String prefixPart = readPrefixPart(); // Prefix part or keyword
@@ -604,7 +585,7 @@ public final class TokenizerText implements Tokenizer
             token.setType(TokenType.PREFIXED_NAME);
             String ln = readLocalPart(); // Local part
             token.setImage2(ln);
-            if ( Checking )
+            if ( CHECKING )
                 checkPrefixedName(token.getImage(), token.getImage2());
         }
 
@@ -613,7 +594,7 @@ public final class TokenizerText implements Tokenizer
         if ( posn == reader.getPosition() )
             fatal("Failed to find a prefix name or keyword: %c(%d;0x%04X)", 
ch, ch, ch);
 
-        if ( Checking )
+        if ( CHECKING )
             checkKeyword(token.getImage());
     }
 
@@ -759,40 +740,64 @@ public final class TokenizerText implements Tokenizer
     }
 
     // Process PLX (percent or character escape for a prefixed name)
-    private void processPLX(int ch)
-    {
-        if ( ch == CH_PERCENT )
-        {
+    private void processPLX(int ch) {
+        if ( ch == CH_PERCENT ) {
             insertCodepointDirect(stringBuilder, ch);
 
             ch = reader.peekChar();
-            if ( ! isHexChar(ch) )
-                fatal("Not a hex character: '%c'",ch);
+            if ( !isHexChar(ch) )
+                fatal("Not a hex character: '%c'", ch);
             insertCodepointDirect(stringBuilder, ch);
             reader.readChar();
 
             ch = reader.peekChar();
-            if ( ! isHexChar(ch) )
-                fatal("Not a hex character: '%c'",ch);
+            if ( !isHexChar(ch) )
+                fatal("Not a hex character: '%c'", ch);
             insertCodepointDirect(stringBuilder, ch);
             reader.readChar();
-        }
-        else if ( ch == CH_RSLASH )
-        {
+        } else if ( ch == CH_RSLASH ) {
             ch = readCharEscape();
             insertCodepoint(stringBuilder, ch);
-        }
-        else
+        } else
             throw new ARQInternalErrorException("Not a '\\' or a '%' 
character");
     }
 
+    /**
+     * Check that a string, which has already had escape processing applied, is a legal "RDF String".
+     * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates.
+     * Because this is java (UTF-16 chars), we test that every surrogate is part of a high-low pair.
+     * This check is performed in readIRI, readStringQuote1, and readStringQuote3.
+     * An unpaired surrogate is reported with fatal(); a legal string passes silently.
+     */
+    private void checkRDFString(String string) {
+        for ( int i = 0 ; i < string.length() ; i++ ) {
+            // Not "codePointAt" which does surrogate processing.
+            char ch = string.charAt(i);
+
+            // A Java char (U+0000 to U+FFFF) is always inside the code point range, so no
+            // range test is needed here; readUnicode8Escape range-checks escapes as they are read.
+
+            // Check surrogate pairs are pairs.
+            if ( Character.isHighSurrogate(ch) ) {
+                i++;
+                if ( i == string.length() )
+                    fatal("Bad surrogate pair (end of string)");
+                char ch1 = string.charAt(i);
+                if ( ! Character.isLowSurrogate(ch1) ) {
+                    fatal("Bad surrogate pair (high surrogate not followed by low surrogate)");
+                }
+            } else if ( Character.isLowSurrogate(ch) ) {
+                fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate)");
+            }
+        }
+    }
+
     // Get characters between two markers.
-    // strEscapes may be processed
+    // String escapes are processed.
     private String readStringQuote1(int startCh, int endCh) {
-        // Position at start of string.
+        // Assumes the 1 character starting delimiter has been read.
+        // Reads the terminating delimiter.
         stringBuilder.setLength(0);
-        // Assumes first delimiter char read already.
-        // Reads terminating delimiter
 
         for (;;) {
             int ch = reader.readChar();
@@ -805,9 +810,13 @@ public final class TokenizerText implements Tokenizer
                 warning("Unicode non-character U+%04X in string", ch);
             if ( ch == EOF )
                 fatal("Broken token: %s", stringBuilder.toString());
-            else if ( ch == endCh )
-                return stringBuilder.toString();
-            else if ( ch == NL )
+            else if ( ch == endCh ) {
+                // Done!
+                String str = stringBuilder.toString();
+                if ( CHECK_RDFSTRING )
+                    checkRDFString(str);
+                return str;
+            } else if ( ch == NL )
                 fatal("Broken token (newline in string)", 
stringBuilder.toString());
             else if ( ch == CR )
                 fatal("Broken token (carriage return in string)", 
stringBuilder.toString());
@@ -823,7 +832,9 @@ public final class TokenizerText implements Tokenizer
         }
     }
 
-    private String readStringQuote3(int quoteChar, boolean endNL) {
+    private String readStringQuote3(int quoteChar) {
+        // Assumes the 3 character starting delimiter has been read.
+        // Reads the terminating delimiter.
         stringBuilder.setLength(0);
         for (;;) {
             int ch = reader.readChar();
@@ -833,13 +844,15 @@ public final class TokenizerText implements Tokenizer
                     warning("Unicode replacement character U+FFFD in string");
             }
             if ( ch == EOF ) {
-                if ( endNL )
-                    return stringBuilder.toString();
                 fatal("Broken long string");
-            }
-            else if ( ch == quoteChar ) {
-                if ( threeQuotes(quoteChar) )
-                    return stringBuilder.toString();
+            } else if ( ch == quoteChar ) {
+                if ( threeQuotes(quoteChar) ) {
+                    String str = stringBuilder.toString();
+                    if ( CHECK_RDFSTRING )
+                        checkRDFString(str);
+                    return str;
+                }
+                // quote, not triple. It is a normal character.
             } else if ( ch == CH_RSLASH )
                 ch = readLiteralEscape();
             insertCodepoint(stringBuilder, ch);
@@ -1249,65 +1262,23 @@ public final class TokenizerText implements Tokenizer
         return reader.getLineNum();
     }
 
-    // ---- Routines to check tokens
-
-    private void checkBlankNode(String blankNodeLabel) {
-        if ( checker != null )
-            checker.checkBlankNode(blankNodeLabel);
-    }
-
-    private void checkLiteralLang(String lexicalForm, String langTag) {
-        if ( checker != null )
-            checker.checkLiteralLang(lexicalForm, langTag);
-    }
-
-    private void checkLiteralDT(String lexicalForm, Token datatype) {
-        if ( checker != null )
-            checker.checkLiteralDT(lexicalForm, datatype);
-    }
-
-    private void checkString(String string) {
-        if ( checker != null )
-            checker.checkString(string);
-    }
-
-    private void checkURI(String uriStr) {
-        if ( checker != null )
-            checker.checkURI(uriStr);
-    }
-
-    private void checkNumber(String image, String datatype) {
-        if ( checker != null )
-            checker.checkNumber(image, datatype);
-    }
-
-    private void checkVariable(String tokenImage) {
-        if ( checker != null )
-            checker.checkVariable(tokenImage);
-    }
-
-    private void checkDirective(String directive) {
-        if ( checker != null )
-            checker.checkDirective(directive);
-    }
-
-    private void checkKeyword(String tokenImage) {
-        if ( checker != null )
-            checker.checkKeyword(tokenImage);
-    }
+    // ---- Escape sequences
 
-    private void checkPrefixedName(String tokenImage, String tokenImage2) {
-        if ( checker != null )
-            checker.checkPrefixedName(tokenImage, tokenImage2);
-    }
+    // Read a unicode escape : does not allow \\ bypass
+    private final int readUnicodeEscape() {
+        int ch = reader.readChar();
+        if ( ch == EOF )
+            fatal("Broken escape sequence");
 
-    private void checkControl(int code) {
-        if ( checker != null )
-            checker.checkControl(code);
+        switch (ch) {
+            case 'u': return readUnicode4Escape();
+            case 'U': return readUnicode8Escape();
+            default:
+                fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", 
ch, ch);
+        }
+        return 0;
     }
 
-    // ---- Escape sequences
-
     private final int readLiteralEscape() {
         int c = reader.readChar();
         if ( c == EOF )
@@ -1325,7 +1296,7 @@ public final class TokenizerText implements Tokenizer
             case 'u':   return readUnicode4Escape();
             case 'U':   return readUnicode8Escape();
             default:
-                fatal("Illegal escape sequence value: %c (0x%02X)", c, c);
+                fatal("Illegal escape sequence value: %c (0x%02X)",c , c);
                 return 0;
         }
     }
@@ -1356,8 +1327,8 @@ public final class TokenizerText implements Tokenizer
 
     private final int readUnicode8Escape() {
         int ch8 = readHexSequence(8);
-        if ( ch8 > Character.MAX_CODE_POINT )
-            fatal("Illegal code point in \\U sequence value: 0x%08X", ch8);
+        if ( ! Character.isValidCodePoint(ch8) )
+            fatal("Illegal code point from \\U sequence value: 0x%08X", ch8);
         return ch8;
     }
 
@@ -1428,4 +1399,61 @@ public final class TokenizerText implements Tokenizer
         // provided error handler does not, we throw an exception.
         throw new RiotParseException(message, line, col);
     }
+
+    // ---- Routines to check tokens
+
+    private void checkBlankNode(String blankNodeLabel) {
+        if ( checker != null )
+            checker.checkBlankNode(blankNodeLabel);
+    }
+
+    private void checkLiteralLang(String lexicalForm, String langTag) {
+        if ( checker != null )
+            checker.checkLiteralLang(lexicalForm, langTag);
+    }
+
+    private void checkLiteralDT(String lexicalForm, Token datatype) {
+        if ( checker != null )
+            checker.checkLiteralDT(lexicalForm, datatype);
+    }
+
+    private void checkString(String string) {
+        if ( checker != null )
+            checker.checkString(string);
+    }
+
+    private void checkURI(String uriStr) {
+        if ( checker != null )
+            checker.checkURI(uriStr);
+    }
+
+    private void checkNumber(String image, String datatype) {
+        if ( checker != null )
+            checker.checkNumber(image, datatype);
+    }
+
+    private void checkVariable(String tokenImage) {
+        if ( checker != null )
+            checker.checkVariable(tokenImage);
+    }
+
+    private void checkDirective(String directive) {
+        if ( checker != null )
+            checker.checkDirective(directive);
+    }
+
+    private void checkKeyword(String tokenImage) {
+        if ( checker != null )
+            checker.checkKeyword(tokenImage);
+    }
+
+    private void checkPrefixedName(String tokenImage, String tokenImage2) {
+        if ( checker != null )
+            checker.checkPrefixedName(tokenImage, tokenImage2);
+    }
+
+    private void checkControl(int code) {
+        if ( checker != null )
+            checker.checkControl(code);
+    }
 }
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java
index bb564415d5..c0dc7e8e39 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java
@@ -129,6 +129,6 @@ public class TokenizerTextBuilder {
             throw new IllegalStateException("No data source");
         }
 
-        return TokenizerText.internal(pr, singleLineMode, !utf8, errHandler);
+        return TokenizerText.internal(pr, singleLineMode, errHandler);
     }
 }
diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java 
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
index 44bdecd5e9..74277b0756 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
@@ -18,33 +18,33 @@
 
 package org.apache.jena.riot.lang;
 
-import static 
org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogging ;
-import static 
org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler ;
-import static 
org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler ;
+import static 
org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogging;
+import static 
org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler;
+import static 
org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 
-import java.io.StringReader ;
+import java.io.StringReader;
 
-import org.junit.AfterClass ;
-import org.junit.BeforeClass ;
-import org.junit.Test ;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
 
-import org.apache.jena.graph.Graph ;
-import org.apache.jena.graph.Triple ;
-import org.apache.jena.rdf.model.Model ;
-import org.apache.jena.rdf.model.ModelFactory ;
-import org.apache.jena.rdf.model.Property ;
-import org.apache.jena.rdf.model.Resource ;
+import org.apache.jena.graph.Graph;
+import org.apache.jena.graph.Triple;
+import org.apache.jena.rdf.model.Model;
+import org.apache.jena.rdf.model.ModelFactory;
+import org.apache.jena.rdf.model.Property;
+import org.apache.jena.rdf.model.Resource;
 import org.apache.jena.riot.ErrorHandlerTestLib.ExError;
-import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ;
-import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ;
-import org.apache.jena.riot.Lang ;
-import org.apache.jena.riot.RDFDataMgr ;
-import org.apache.jena.riot.RDFLanguages ;
-import org.apache.jena.riot.system.ErrorHandler ;
-import org.apache.jena.sparql.sse.SSE ;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFDataMgr;
+import org.apache.jena.riot.RDFLanguages;
+import org.apache.jena.riot.system.ErrorHandler;
+import org.apache.jena.sparql.sse.SSE;
 
 public class TestLangTurtle
 {
@@ -140,63 +140,63 @@ public class TestLangTurtle
     }
 
     @Test
-    public void triple()                { parse("<s> <p> <o> .") ; }
+    public void triple()                { parse("<s> <p> <o> ."); }
 
     @Test(expected=ExFatal.class)
-    public void errorJunk_1()           { parse("<p>") ; }
+    public void errorJunk_1()           { parse("<p>"); }
 
     @Test(expected=ExFatal.class)
-    public void errorJunk_2()           { parse("<r> <p>") ; }
+    public void errorJunk_2()           { parse("<r> <p>"); }
 
     @Test(expected=ExFatal.class)
-    public void errorNoPrefixDef()      { parse("x:p <p> 'q' .") ; }
+    public void errorNoPrefixDef()      { parse("x:p <p> 'q' ."); }
 
     @Test(expected=ExFatal.class)
-    public void errorNoPrefixDefDT()    { parse("<p> <p> 'q'^^x:foo .") ; }
+    public void errorNoPrefixDefDT()    { parse("<p> <p> 'q'^^x:foo ."); }
 
     @Test(expected=ExFatal.class)
-    public void errorBadDatatype()      { parse("<p> <p> 'q'^^.") ; }
+    public void errorBadDatatype()      { parse("<p> <p> 'q'^^."); }
 
     @Test(expected=ExError.class)
-    public void errorBadURI_1()         { parse("<http://example/a b> 
<http://example/p> 123 .") ; }
+    public void errorBadURI_1()         { parse("<http://example/a b> 
<http://example/p> 123 ."); }
 
     @Test(expected=ExWarning.class)
     // Passes tokenization but fails IRI parsing.
-    public void errorBadURI_2()         { parse("<http://example/a%XAb> 
<http://example/p> 123 .") ; }
+    public void errorBadURI_2()         { parse("<http://example/a%XAb> 
<http://example/p> 123 ."); }
 
     // Bad URIs
     @Test (expected=ExError.class)
-    public void errorBadURI_3()         { parse("@prefix ex:  <bad iri> .  
ex:s ex:p 123 ") ; }
+    public void errorBadURI_3()         { parse("@prefix ex:  <bad iri> .  
ex:s ex:p 123 "); }
 
     @Test (expected=ExError.class)
-    public void errorBadURI_4()         { parse("<x> <p> 'number'^^<bad uri> 
") ; }
+    public void errorBadURI_4()         { parse("<x> <p> 'number'^^<bad uri> 
"); }
 
     // Structural errors.
     @Test (expected=ExFatal.class)
-    public void errorBadList_1()        { parse("<x> <p> (") ; }
+    public void errorBadList_1()        { parse("<x> <p> ("); }
 
     @Test (expected=ExFatal.class)
-    public void errorBadList_2()        { parse("<x> <p> ( <z>") ; }
+    public void errorBadList_2()        { parse("<x> <p> ( <z>"); }
 
     @Test
     public void turtle_01() {
-        Triple t = parseOneTriple("<s> <p> 123 . ") ;
-        Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)") ;
-        assertEquals(t2, t) ;
+        Triple t = parseOneTriple("<s> <p> 123 . ");
+        Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)");
+        assertEquals(t2, t);
     }
 
     @Test
     public void turtle_02() {
-        Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . ") ;
-        Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p> 
123)") ;
-        assertEquals(t2, t) ;
+        Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . ");
+        Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p> 
123)");
+        assertEquals(t2, t);
     }
 
     @Test
     public void turtle_03() {
-        Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p 
123 . ") ;
-        Triple t2 = SSE.parseTriple("(<http://example/x/s> 
<http://example/x/p> 123)") ;
-        assertEquals(t2, t) ;
+        Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p 
123 . ");
+        Triple t2 = SSE.parseTriple("(<http://example/x/s> 
<http://example/x/p> 123)");
+        assertEquals(t2, t);
     }
 
     // RDF 1.2 (some basic testing)
@@ -205,13 +205,13 @@ public class TestLangTurtle
 
     @Test
     public void turtle_rdf12_01() {
-        Graph graph = parse(PREFIXES, "<< :s :p 123 >> . ") ;
+        Graph graph = parse(PREFIXES, "<< :s :p 123 >> . ");
         assertEquals(1, graph.size());
     }
 
     @Test
     public void turtle_rdf12_02() {
-        Graph graph = parse(PREFIXES, "<< :s :p 123 >> :q 'abc' ") ;
+        Graph graph = parse(PREFIXES, "<< :s :p 123 >> :q 'abc' ");
         assertEquals(2, graph.size());
     }
 
@@ -235,42 +235,105 @@ public class TestLangTurtle
 
     @Test
     public void turtle_rdf12_11() {
-        Triple t = parseOneTriple("VERSION \"1.2\" <x:s> <x:p> 123 . ") ;
+        parseOneTriple("VERSION \"1.2\" <x:s> <x:p> 123 . ");
     }
 
     @Test
     public void turtle_rdf12_12() {
-        Triple t = parseOneTriple("VERSION '1.2' <x:s> <x:p> 123 . ") ;
+        parseOneTriple("VERSION '1.2' <x:s> <x:p> 123 . ");
     }
 
     @Test
     public void turtle_rdf12_13() {
-        Triple t = parseOneTriple("@version '1.2' . <x:s> <x:p> 123 . ") ;
+        parseOneTriple("@version '1.2' . <x:s> <x:p> 123 . ");
     }
 
     public void turtle_rdf12_14() {
-        Triple t = parseOneTriple("@version \"1.2\" . <x:s> <x:p> 123 . ") ;
+        parseOneTriple("@version \"1.2\" . <x:s> <x:p> 123 . ");
     }
 
     @Test (expected=ExFatal.class)
     public void turtle_rdf12_bad_11() {
-        Triple t = parseOneTriple("VERSION '1.2' . <x:s> <x:p> 123 . ") ;
+        parseOneTriple("VERSION '1.2' . <x:s> <x:p> 123 . ");
     }
 
     @Test (expected=ExFatal.class)
     public void turtle_rdf12_bad_12() {
-        Triple t = parseOneTriple("VERSION '''1.2''' <x:s> <x:p> 123 . ") ;
+        parseOneTriple("VERSION '''1.2''' <x:s> <x:p> 123 . ");
     }
 
     @Test (expected=ExFatal.class)
     public void turtle_rdf12_bad_13() {
-        Triple t = parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . 
") ;
+        parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . ");
+    }
+
+    // U+D800-U+DBFF is a high surrogate (first part of a pair)
+    // U+DC00-U+DFFF is a low surrogate (second part of a pair)
+    // so D800-DC00 is legal.
+
+    @Test public void turtle_surrogate_1() {
+        // escaped high, escaped low
+        parseOneTriple("<x:s> <x:p> '\\ud800\\udc00' . ");
+    }
+
+    @Test public void turtle_surrogate_2() {
+        // escaped high, raw low
+        parseOneTriple("<x:s> <x:p> '\\ud800\udc00' . ");
+    }
+
+    // Compilation failure. (maven+openjdk - OK in Eclipse, and test correct)
+//    @Test public void turtle_surrogate_3() {
+//        // raw high, escaped low
+//        parseOneTriple("<x:s> <x:p> '\ud800\\udc00' . ");
+//    }
+
+    @Test public void turtle_surrogate_4() {
+        // raw high, raw low
+        parseOneTriple("<x:s> <x:p> '\ud800\udc00' . ");
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_1() {
+        parseOneTriple("<x:s> <x:p> '\\ud800' . ");
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_2() {
+        parseOneTriple("<x:s> <x:p> '\\udfff' . ");
+    }
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_3() {
+        parseOneTriple("<x:s> <x:p> '\\U0000d800' . ");
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_4() {
+        parseOneTriple("<x:s> <x:p> '\\U0000dfff' . ");
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_5() {
+        // Wrong way round: low-high (U+DC00 is a low surrogate, U+D800 is a high surrogate)
+        parseOneTriple("<x:s> <x:p> '\\udc00\\ud800' . ");
+    }
+
+    // Compilation failure. Can't write \ud800
+//    @Test (expected=ExFatal.class)
+//    public void turtle_bad_surrogate_6() {
+//        // raw low - escaped high
+//        parseOneTriple("<x:s> <x:p> '\ud800\\ud800' . ");
+//    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_7() {
+        // escaped low (U+DC00) - raw high (U+D800)
+        parseOneTriple("<x:s> <x:p> '\\udc00\ud800' . ");
+    }
 
     // No Formulae. Not trig.
     @Test (expected=ExFatal.class)
-    public void turtle_50()     { parse("@prefix ex:  <http://example/> .  { 
ex:s ex:p 123 . } ") ; }
+    public void turtle_50()     { parse("@prefix ex:  <http://example/> .  { 
ex:s ex:p 123 . } "); }
 
     @Test (expected=ExWarning.class)
-    public void turtle_60()     { parse("@prefix xsd:  
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; }
+    public void turtle_60()     { parse("@prefix xsd:  
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }"); }
 }
diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java 
b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
index b4c2865afb..bc2226a97a 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
@@ -1352,6 +1352,135 @@ public class TestTokenizerText {
         assertFalse(tokenizer.hasNext());
     }
 
+    // U+D800-U+DBFF is a high surrogate (first part of a pair)
+    // U+DC00-U+DFFF is a low surrogate (second part of a pair)
+    // so the sequence D800 DC00 (high, then low) is legal.
+
+    @Test public void turtle_surrogate_pair_01() {
+        // escaped high, escaped low
+        surrogate("'\\ud800\\udc00'");
+    }
+
+    @Test public void turtle_surrogate_pair_02() {
+        // escaped high, raw low
+        surrogate("'\\ud800\udc00'");
+    }
+
+    // Compilation failure - illegal escape character
+//    @Test public void turtle_surrogate_pair_03() {
+//        // raw high, escaped low
+//        surrogate("'\ud800\\udc00'");
+//    }
+
+    @Test public void turtle_surrogate_pair_04() {
+        // raw high, raw low
+        surrogate("'\ud800\udc00'");
+    }
+
+    @Test public void turtle_surrogate_pair_05() {
+        // escaped high, escaped low
+        surrogate("'a\\ud800\\udc00x'");
+    }
+
+    @Test public void turtle_surrogate_pair_06() {
+        // escaped high, raw low
+        surrogate("'z\\ud800\udc00'z");
+    }
+
+    // Compilation failure - illegal escape character
+//    @Test public void turtle_surrogate_pair_07() {
+//        // raw high, escaped low
+//        surrogate("'a\ud800\\udc00'z");
+//    }
+
+    @Test public void turtle_surrogate_pair_08() {
+        // raw high, raw low
+        surrogate("'a\ud800\udc00'z");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_01() {
+        surrogate("'\\ud800'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_02() {
+        surrogate("'a\\ud800z'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_03() {
+        surrogate("'\\udfff'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_04() {
+        surrogate("'a\\udfffz'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_05() {
+        surrogate("'\\U0000d800'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_06() {
+        surrogate("'a\\U0000d800z'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_07() {
+        surrogate("'\\U0000dfff'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_08() {
+        surrogate("'a\\U0000dfffz'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_09() {
+        // Intended low-high, but U+C800 is not a surrogate (low is U+DC00-U+DFFF); the unpaired high U+D800 is the error.
+        surrogate("'\\uc800\\ud800'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_10() {
+        // Intended low-high, but U+C800 is not a surrogate (low is U+DC00-U+DFFF); the unpaired high U+D800 is the error.
+        surrogate("'a\\uc800\\ud800z'");
+    }
+
+    // Compilation failure - illegal escape character
+//    @Test (expected=RiotParseException.class)
+//    public void turtle_bad_surrogate_11() {
+//        // raw high - escaped high
+//        surrogate("'\ud800\\ud800'");
+//    }
+//
+//    @Test (expected=RiotParseException.class)
+//    public void turtle_bad_surrogate_12() {
+//        // raw high - escaped high
+//        surrogate("'a\ud800\\ud800z'");
+//    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_13() {
+        // Intended escaped low - raw high, but U+C800 is not a surrogate; the unpaired raw high U+D800 is the error.
+        surrogate("'\\uc800\ud800'");
+    }
+
+    @Test (expected=RiotParseException.class)
+    public void turtle_bad_surrogate_14() {
+        // Intended escaped low - raw high, but U+C800 is not a surrogate; the unpaired raw high U+D800 is the error.
+        surrogate("'a\\uc800\ud800z'");
+    }
+
+    private void surrogate(String string) {
+        Tokenizer tokenizer = tokenizer(string);
+        tokenizer.hasNext();
+        tokenizer.next();
+    }
+
     @Test
     public void token_rdf_star_reified_1() {
         Tokenizer tokenizer = tokenizer("<<");
diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java 
b/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java
index a5078e0d57..5f2e6babd2 100644
--- a/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java
+++ b/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java
@@ -21,7 +21,6 @@ package org.apache.jena.atlas.io;
 
 /**
  * A simplified reader interface without IOExceptions.
- * It's an interface, not an abstract class
  */
 public interface CharStream
 {
diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java 
b/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java
index ae0efecaa5..1d24437d26 100644
--- a/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java
+++ b/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java
@@ -34,11 +34,8 @@ import org.apache.jena.atlas.lib.Chars;
 public final class PeekReader extends Reader {
     // Remember to apply fixes to PeekInputStream as well.
 
-    // Buffering is done by a CharStream - does it make difference?
-    // Yes. A lot (Java6).
-
-    // Using a Reader here seems to have zero cost or benefit but CharStream
-    // allows fast String handling.
+    // Buffering is done by a CharStream - it makes a difference:
+    // a CharStream is faster than a Reader.
     private final CharStream source;
 
     private static final int PUSHBACK_SIZE = 10;
diff --git 
a/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java 
b/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java
index 872b90fe0c..e62eed6b12 100644
--- 
a/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java
+++ 
b/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java
@@ -80,12 +80,8 @@ public class TestNodec
     @Test public void nodec_lit_22()    { test ("''^^<>"); }
 
     // Bad Unicode.
-    static private final String binaryStr1  = "abc\uD800xyz";    // A single 
surrogate, without it's pair.
-    static private final String binaryStr2  = "\uD800";          // A single 
surrogate, without it's pair.
-    static private final String binaryStr3  = "\u0000";          // A zero 
character
+    static private final String binaryStr3  = "\u0000";                 // A 
zero character
 
-    @Test public void nodec_lit_30()    { test ("'"+binaryStr1+"'"); }
-    @Test public void nodec_lit_31()    { test ("'"+binaryStr2+"'"); }
     @Test public void nodec_lit_32()    { test ("'"+binaryStr3+"'"); }
 
     @Test public void nodec_lit_33()    { test("'\uFFFD'"); }

(jena) 06/07: GH-3281: Update TokenizerText to check RDF Strings

Reply via email to