This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 530d0a727cf05641372f55efbe9b5ce2c7d8ce79 Author: Andy Seaborne <[email protected]> AuthorDate: Thu Jun 26 19:07:50 2025 +0100 GH-3281: Update TokenizerText to check RDF Strings --- .../org/apache/jena/riot/tokens/TokenizerText.java | 290 +++++++++++---------- .../jena/riot/tokens/TokenizerTextBuilder.java | 2 +- .../org/apache/jena/riot/lang/TestLangTurtle.java | 167 ++++++++---- .../apache/jena/riot/tokens/TestTokenizerText.java | 129 +++++++++ .../java/org/apache/jena/atlas/io/CharStream.java | 1 - .../java/org/apache/jena/atlas/io/PeekReader.java | 7 +- .../jena/tdb1/store/nodetable/TestNodec.java | 6 +- 7 files changed, 407 insertions(+), 195 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java index 9e315870c6..52547b2d50 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java @@ -43,10 +43,13 @@ public final class TokenizerText implements Tokenizer // do the check later in the parsing process. In case a need arises, the code // remains, all compiled away by "if ( false )" (javac does not generate any // bytecodes and even if it it did, JIT will remove dead branches). - private static final boolean Checking = false; + private static final boolean CHECKING = false; // Optional checker. private final TokenChecker checker = null; + // Whether to check for legal RDF strings (no ill formed use of surrogates) + private static final boolean CHECK_RDFSTRING = true; + // Workspace for building token images. // Reusing a StringBuilder is faster than allocating a fresh one each time. private final StringBuilder stringBuilder = new StringBuilder(200); @@ -55,8 +58,6 @@ public final class TokenizerText implements Tokenizer private final PeekReader reader; // Whether whitespace between tokens includes newlines (in various forms). private final boolean singleLineMode; - // Indicator. The PeekReader should throw java.nio.charset.MalformedInputException - private final boolean isASCII; // The code assumes that errors throw exception and so stop parsing. private final ErrorHandler errorHandler; @@ -67,15 +68,14 @@ public final class TokenizerText implements Tokenizer public static Tokenizer fromString(String string) { return create().fromString(string).build(); } - /*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) { - return new TokenizerText(reader, singleLineMode, isASCII, errorHandler); + /*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) { + return new TokenizerText(reader, singleLineMode, errorHandler); } - private TokenizerText(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) { + private TokenizerText(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) { this.reader = Objects.requireNonNull(reader, "PeekReader"); this.singleLineMode = singleLineMode; this.errorHandler = Objects.requireNonNull(errorHandler, "ErrorHandler"); - this.isASCII = isASCII; } @Override @@ -183,7 +183,7 @@ public final class TokenizerText implements Tokenizer // '<' not '<<' token.setImage(readIRI()); token.setType(TokenType.IRI); - if ( Checking ) + if ( CHECKING ) checkURI(token.getImage()); return token; } @@ -215,7 +215,7 @@ public final class TokenizerText implements Tokenizer int ch3 = reader.peekChar(); if ( ch3 == ch ) { reader.readChar(); // Read potential third quote. - token.setImage(readStringQuote3(ch, false)); + token.setImage(readStringQuote3(ch)); StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2; token.setStringType(st); } else { @@ -224,8 +224,7 @@ public final class TokenizerText implements Tokenizer // No need to pushback characters as we know the lexical // form is the empty string. // if ( ch2 != EOF ) reader.pushbackChar(ch2); - // if ( ch1 != EOF ) reader.pushbackChar(ch1); // Must be - // '' or "" + // if ( ch1 != EOF ) reader.pushbackChar(ch1); token.setImage(""); StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2; token.setStringType(st); @@ -251,7 +250,7 @@ public final class TokenizerText implements Tokenizer mainToken.setSubToken1(token); mainToken.setImage2(langTag()); token = mainToken; - if ( Checking ) + if ( CHECKING ) checkLiteralLang(token.getImage(), token.getImage2()); } else if ( reader.peekChar() == '^' ) { expect("^^"); @@ -276,11 +275,11 @@ public final class TokenizerText implements Tokenizer mainToken.setType(TokenType.LITERAL_DT); token = mainToken; - if ( Checking ) + if ( CHECKING ) checkLiteralDT(token.getImage(), subToken); } else { // Was a simple string. - if ( Checking ) + if ( CHECKING ) checkString(token.getImage()); } return token; @@ -293,7 +292,7 @@ public final class TokenizerText implements Tokenizer reader.readChar(); token.setImage(readBlankNodeLabel()); token.setType(TokenType.BNODE); - if ( Checking ) checkBlankNode(token.getImage()); + if ( CHECKING ) checkBlankNode(token.getImage()); return token; } token.setType(TokenType.UNDERSCORE); @@ -306,7 +305,7 @@ public final class TokenizerText implements Tokenizer reader.readChar(); token.setType(TokenType.DIRECTIVE); token.setImage(readWord(false)); - if ( Checking ) + if ( CHECKING ) checkDirective(token.getImage()); return token; } @@ -317,7 +316,7 @@ public final class TokenizerText implements Tokenizer token.setType(TokenType.VAR); // Character set? token.setImage(readVarName()); - if ( Checking ) + if ( CHECKING ) checkVariable(token.getImage()); return token; } @@ -336,7 +335,7 @@ public final class TokenizerText implements Tokenizer reader.pushbackChar(CH_DOT); boolean charactersConsumed = readNumber(CH_ZERO, false); if ( charactersConsumed ) { - if ( Checking ) + if ( CHECKING ) checkNumber(token.getImage(), token.getImage2()); return token; } @@ -517,7 +516,7 @@ public final class TokenizerText implements Tokenizer readPrefixedNameOrKeyword(token); - if ( Checking ) checkKeyword(token.getImage()); + if ( CHECKING ) checkKeyword(token.getImage()); return token; } @@ -539,17 +538,14 @@ public final class TokenizerText implements Tokenizer fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null; case CH_GT: // Done! - return stringBuilder.toString(); + String str = stringBuilder.toString(); + if ( CHECK_RDFSTRING ) + checkRDFString(str); + return str; case CH_RSLASH: - if ( VeryVeryLaxIRI ) - // Includes unicode escapes and also \n etc - ch = readLiteralEscape(); - else - // NORMAL - ch = readUnicodeEscape(); + ch = readUnicodeEscape(); // Don't check legality of ch (strict syntax at this point). - // That does not mean it is a good idea to bypass checking. - // Bad characters will lead to trouble elsewhere. + // IRI parsing will catch errors. break; case CH_LT: // Probably a corrupt file so treat as fatal. @@ -558,7 +554,7 @@ public final class TokenizerText implements Tokenizer error("Bad character in IRI (tab character): <%s[tab]...>", stringBuilder.toString()); break; case '{': case '}': case '"': case '|': case '^': case '`' : if ( ! VeryVeryLaxIRI ) - warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch); + warning("Illegal character in IRI (codepoint U+%04X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch); break; case SPC: if ( ! AllowSpacesInIRI ) @@ -578,21 +574,6 @@ public final class TokenizerText implements Tokenizer } } - // Read a unicode escape : does not allow \\ bypass - private final int readUnicodeEscape() { - int ch = reader.readChar(); - if ( ch == EOF ) - fatal("Broken escape sequence"); - - switch (ch) { - case 'u': return readUnicode4Escape(); - case 'U': return readUnicode8Escape(); - default: - fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch); - } - return 0; - } - private void readPrefixedNameOrKeyword(Token token) { long posn = reader.getPosition(); String prefixPart = readPrefixPart(); // Prefix part or keyword @@ -604,7 +585,7 @@ public final class TokenizerText implements Tokenizer token.setType(TokenType.PREFIXED_NAME); String ln = readLocalPart(); // Local part token.setImage2(ln); - if ( Checking ) + if ( CHECKING ) checkPrefixedName(token.getImage(), token.getImage2()); } @@ -613,7 +594,7 @@ public final class TokenizerText implements Tokenizer if ( posn == reader.getPosition() ) fatal("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch); - if ( Checking ) + if ( CHECKING ) checkKeyword(token.getImage()); } @@ -759,40 +740,64 @@ public final class TokenizerText implements Tokenizer } // Process PLX (percent or character escape for a prefixed name) - private void processPLX(int ch) - { - if ( ch == CH_PERCENT ) - { + private void processPLX(int ch) { + if ( ch == CH_PERCENT ) { insertCodepointDirect(stringBuilder, ch); ch = reader.peekChar(); - if ( ! isHexChar(ch) ) - fatal("Not a hex character: '%c'",ch); + if ( !isHexChar(ch) ) + fatal("Not a hex character: '%c'", ch); insertCodepointDirect(stringBuilder, ch); reader.readChar(); ch = reader.peekChar(); - if ( ! isHexChar(ch) ) - fatal("Not a hex character: '%c'",ch); + if ( !isHexChar(ch) ) + fatal("Not a hex character: '%c'", ch); insertCodepointDirect(stringBuilder, ch); reader.readChar(); - } - else if ( ch == CH_RSLASH ) - { + } else if ( ch == CH_RSLASH ) { ch = readCharEscape(); insertCodepoint(stringBuilder, ch); - } - else + } else throw new ARQInternalErrorException("Not a '\\' or a '%' character"); } + /** + * Apply any checks for "RDF String" to a string that has already had escape processing applied. + * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates. + * Because this is java, we test for no non-paired surrogates. + * A surrogate pair is high-low. + * This check is performed in readIRI, readStrignQuote1, and readStringQuote3 + */ + private void checkRDFString(String string) { + for ( int i = 0 ; i < string.length() ; i++ ) { + // Not "codePointAt" which does surrogate processing. + char ch = string.charAt(i); + + if ( ! Character.isValidCodePoint(ch) ) + warning("Illegal code point in \\U sequence value: 0x%08X", ch); + + // Check surrogate pairs are pairs. + if ( Character.isHighSurrogate(ch) ) { + i++; + if ( i == string.length() ) + fatal("Bad surrogate pair (end of string)"); + char ch1 = string.charAt(i); + if ( ! Character.isLowSurrogate(ch1) ) { + fatal("Bad surrogate pair (high surrogate not followed by low surrogate)"); + } + } else if ( Character.isLowSurrogate(ch) ) { + fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate)"); + } + } + } + // Get characters between two markers. - // strEscapes may be processed + // String escapes are processed. private String readStringQuote1(int startCh, int endCh) { - // Position at start of string. + // Assumes the 1 character starting delimiter has been read. + // Reads the terminating delimiter. stringBuilder.setLength(0); - // Assumes first delimiter char read already. - // Reads terminating delimiter for (;;) { int ch = reader.readChar(); @@ -805,9 +810,13 @@ public final class TokenizerText implements Tokenizer warning("Unicode non-character U+%04X in string", ch); if ( ch == EOF ) fatal("Broken token: %s", stringBuilder.toString()); - else if ( ch == endCh ) - return stringBuilder.toString(); - else if ( ch == NL ) + else if ( ch == endCh ) { + // Done! + String str = stringBuilder.toString(); + if ( CHECK_RDFSTRING ) + checkRDFString(str); + return str; + } else if ( ch == NL ) fatal("Broken token (newline in string)", stringBuilder.toString()); else if ( ch == CR ) fatal("Broken token (carriage return in string)", stringBuilder.toString()); @@ -823,7 +832,9 @@ public final class TokenizerText implements Tokenizer } } - private String readStringQuote3(int quoteChar, boolean endNL) { + private String readStringQuote3(int quoteChar) { + // Assumes the 3 character starting delimiter has been read. + // Reads the terminating delimiter. stringBuilder.setLength(0); for (;;) { int ch = reader.readChar(); @@ -833,13 +844,15 @@ public final class TokenizerText implements Tokenizer warning("Unicode replacement character U+FFFD in string"); } if ( ch == EOF ) { - if ( endNL ) - return stringBuilder.toString(); fatal("Broken long string"); - } - else if ( ch == quoteChar ) { - if ( threeQuotes(quoteChar) ) - return stringBuilder.toString(); + } else if ( ch == quoteChar ) { + if ( threeQuotes(quoteChar) ) { + String str = stringBuilder.toString(); + if ( CHECK_RDFSTRING ) + checkRDFString(str); + return str; + } + // quote, not triple. It is a normal character. } else if ( ch == CH_RSLASH ) ch = readLiteralEscape(); insertCodepoint(stringBuilder, ch); @@ -1249,65 +1262,23 @@ public final class TokenizerText implements Tokenizer return reader.getLineNum(); } - // ---- Routines to check tokens - - private void checkBlankNode(String blankNodeLabel) { - if ( checker != null ) - checker.checkBlankNode(blankNodeLabel); - } - - private void checkLiteralLang(String lexicalForm, String langTag) { - if ( checker != null ) - checker.checkLiteralLang(lexicalForm, langTag); - } - - private void checkLiteralDT(String lexicalForm, Token datatype) { - if ( checker != null ) - checker.checkLiteralDT(lexicalForm, datatype); - } - - private void checkString(String string) { - if ( checker != null ) - checker.checkString(string); - } - - private void checkURI(String uriStr) { - if ( checker != null ) - checker.checkURI(uriStr); - } - - private void checkNumber(String image, String datatype) { - if ( checker != null ) - checker.checkNumber(image, datatype); - } - - private void checkVariable(String tokenImage) { - if ( checker != null ) - checker.checkVariable(tokenImage); - } - - private void checkDirective(String directive) { - if ( checker != null ) - checker.checkDirective(directive); - } - - private void checkKeyword(String tokenImage) { - if ( checker != null ) - checker.checkKeyword(tokenImage); - } + // ---- Escape sequences - private void checkPrefixedName(String tokenImage, String tokenImage2) { - if ( checker != null ) - checker.checkPrefixedName(tokenImage, tokenImage2); - } + // Read a unicode escape : does not allow \\ bypass + private final int readUnicodeEscape() { + int ch = reader.readChar(); + if ( ch == EOF ) + fatal("Broken escape sequence"); - private void checkControl(int code) { - if ( checker != null ) - checker.checkControl(code); + switch (ch) { + case 'u': return readUnicode4Escape(); + case 'U': return readUnicode8Escape(); + default: + fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch); + } + return 0; } - // ---- Escape sequences - private final int readLiteralEscape() { int c = reader.readChar(); if ( c == EOF ) @@ -1325,7 +1296,7 @@ public final class TokenizerText implements Tokenizer case 'u': return readUnicode4Escape(); case 'U': return readUnicode8Escape(); default: - fatal("Illegal escape sequence value: %c (0x%02X)", c, c); + fatal("Illegal escape sequence value: %c (0x%02X)",c , c); return 0; } } @@ -1356,8 +1327,8 @@ public final class TokenizerText implements Tokenizer private final int readUnicode8Escape() { int ch8 = readHexSequence(8); - if ( ch8 > Character.MAX_CODE_POINT ) - fatal("Illegal code point in \\U sequence value: 0x%08X", ch8); + if ( ! Character.isValidCodePoint(ch8) ) + fatal("Illegal code point from \\U sequence value: 0x%08X", ch8); return ch8; } @@ -1428,4 +1399,61 @@ public final class TokenizerText implements Tokenizer // provided error handler does not, we throw an exception. throw new RiotParseException(message, line, col); } + + // ---- Routines to check tokens + + private void checkBlankNode(String blankNodeLabel) { + if ( checker != null ) + checker.checkBlankNode(blankNodeLabel); + } + + private void checkLiteralLang(String lexicalForm, String langTag) { + if ( checker != null ) + checker.checkLiteralLang(lexicalForm, langTag); + } + + private void checkLiteralDT(String lexicalForm, Token datatype) { + if ( checker != null ) + checker.checkLiteralDT(lexicalForm, datatype); + } + + private void checkString(String string) { + if ( checker != null ) + checker.checkString(string); + } + + private void checkURI(String uriStr) { + if ( checker != null ) + checker.checkURI(uriStr); + } + + private void checkNumber(String image, String datatype) { + if ( checker != null ) + checker.checkNumber(image, datatype); + } + + private void checkVariable(String tokenImage) { + if ( checker != null ) + checker.checkVariable(tokenImage); + } + + private void checkDirective(String directive) { + if ( checker != null ) + checker.checkDirective(directive); + } + + private void checkKeyword(String tokenImage) { + if ( checker != null ) + checker.checkKeyword(tokenImage); + } + + private void checkPrefixedName(String tokenImage, String tokenImage2) { + if ( checker != null ) + checker.checkPrefixedName(tokenImage, tokenImage2); + } + + private void checkControl(int code) { + if ( checker != null ) + checker.checkControl(code); + } } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java index bb564415d5..c0dc7e8e39 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java @@ -129,6 +129,6 @@ public class TokenizerTextBuilder { throw new IllegalStateException("No data source"); } - return TokenizerText.internal(pr, singleLineMode, !utf8, errHandler); + return TokenizerText.internal(pr, singleLineMode, errHandler); } } diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java index 44bdecd5e9..74277b0756 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java @@ -18,33 +18,33 @@ package org.apache.jena.riot.lang; -import static org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogging ; -import static org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler ; -import static org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler ; +import static org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogging; +import static org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler; +import static org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; -import java.io.StringReader ; +import java.io.StringReader; -import org.junit.AfterClass ; -import org.junit.BeforeClass ; -import org.junit.Test ; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; -import org.apache.jena.graph.Graph ; -import org.apache.jena.graph.Triple ; -import org.apache.jena.rdf.model.Model ; -import org.apache.jena.rdf.model.ModelFactory ; -import org.apache.jena.rdf.model.Property ; -import org.apache.jena.rdf.model.Resource ; +import org.apache.jena.graph.Graph; +import org.apache.jena.graph.Triple; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; import org.apache.jena.riot.ErrorHandlerTestLib.ExError; -import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ; -import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ; -import org.apache.jena.riot.Lang ; -import org.apache.jena.riot.RDFDataMgr ; -import org.apache.jena.riot.RDFLanguages ; -import org.apache.jena.riot.system.ErrorHandler ; -import org.apache.jena.sparql.sse.SSE ; +import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal; +import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.riot.RDFLanguages; +import org.apache.jena.riot.system.ErrorHandler; +import org.apache.jena.sparql.sse.SSE; public class TestLangTurtle { @@ -140,63 +140,63 @@ public class TestLangTurtle } @Test - public void triple() { parse("<s> <p> <o> .") ; } + public void triple() { parse("<s> <p> <o> ."); } @Test(expected=ExFatal.class) - public void errorJunk_1() { parse("<p>") ; } + public void errorJunk_1() { parse("<p>"); } @Test(expected=ExFatal.class) - public void errorJunk_2() { parse("<r> <p>") ; } + public void errorJunk_2() { parse("<r> <p>"); } @Test(expected=ExFatal.class) - public void errorNoPrefixDef() { parse("x:p <p> 'q' .") ; } + public void errorNoPrefixDef() { parse("x:p <p> 'q' ."); } @Test(expected=ExFatal.class) - public void errorNoPrefixDefDT() { parse("<p> <p> 'q'^^x:foo .") ; } + public void errorNoPrefixDefDT() { parse("<p> <p> 'q'^^x:foo ."); } @Test(expected=ExFatal.class) - public void errorBadDatatype() { parse("<p> <p> 'q'^^.") ; } + public void errorBadDatatype() { parse("<p> <p> 'q'^^."); } @Test(expected=ExError.class) - public void errorBadURI_1() { parse("<http://example/a b> <http://example/p> 123 .") ; } + public void errorBadURI_1() { parse("<http://example/a b> <http://example/p> 123 ."); } @Test(expected=ExWarning.class) // Passes tokenization but fails IRI parsing. - public void errorBadURI_2() { parse("<http://example/a%XAb> <http://example/p> 123 .") ; } + public void errorBadURI_2() { parse("<http://example/a%XAb> <http://example/p> 123 ."); } // Bad URIs @Test (expected=ExError.class) - public void errorBadURI_3() { parse("@prefix ex: <bad iri> . ex:s ex:p 123 ") ; } + public void errorBadURI_3() { parse("@prefix ex: <bad iri> . ex:s ex:p 123 "); } @Test (expected=ExError.class) - public void errorBadURI_4() { parse("<x> <p> 'number'^^<bad uri> ") ; } + public void errorBadURI_4() { parse("<x> <p> 'number'^^<bad uri> "); } // Structural errors. @Test (expected=ExFatal.class) - public void errorBadList_1() { parse("<x> <p> (") ; } + public void errorBadList_1() { parse("<x> <p> ("); } @Test (expected=ExFatal.class) - public void errorBadList_2() { parse("<x> <p> ( <z>") ; } + public void errorBadList_2() { parse("<x> <p> ( <z>"); } @Test public void turtle_01() { - Triple t = parseOneTriple("<s> <p> 123 . ") ; - Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)") ; - assertEquals(t2, t) ; + Triple t = parseOneTriple("<s> <p> 123 . "); + Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)"); + assertEquals(t2, t); } @Test public void turtle_02() { - Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . ") ; - Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p> 123)") ; - assertEquals(t2, t) ; + Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . "); + Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p> 123)"); + assertEquals(t2, t); } @Test public void turtle_03() { - Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p 123 . ") ; - Triple t2 = SSE.parseTriple("(<http://example/x/s> <http://example/x/p> 123)") ; - assertEquals(t2, t) ; + Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p 123 . "); + Triple t2 = SSE.parseTriple("(<http://example/x/s> <http://example/x/p> 123)"); + assertEquals(t2, t); } // RDF 1.2 (some basic testing) @@ -205,13 +205,13 @@ public class TestLangTurtle @Test public void turtle_rdf12_01() { - Graph graph = parse(PREFIXES, "<< :s :p 123 >> . ") ; + Graph graph = parse(PREFIXES, "<< :s :p 123 >> . "); assertEquals(1, graph.size()); } @Test public void turtle_rdf12_02() { - Graph graph = parse(PREFIXES, "<< :s :p 123 >> :q 'abc' ") ; + Graph graph = parse(PREFIXES, "<< :s :p 123 >> :q 'abc' "); assertEquals(2, graph.size()); } @@ -235,42 +235,105 @@ public class TestLangTurtle @Test public void turtle_rdf12_11() { - Triple t = parseOneTriple("VERSION \"1.2\" <x:s> <x:p> 123 . ") ; + parseOneTriple("VERSION \"1.2\" <x:s> <x:p> 123 . "); } @Test public void turtle_rdf12_12() { - Triple t = parseOneTriple("VERSION '1.2' <x:s> <x:p> 123 . ") ; + parseOneTriple("VERSION '1.2' <x:s> <x:p> 123 . "); } @Test public void turtle_rdf12_13() { - Triple t = parseOneTriple("@version '1.2' . <x:s> <x:p> 123 . ") ; + parseOneTriple("@version '1.2' . <x:s> <x:p> 123 . "); } public void turtle_rdf12_14() { - Triple t = parseOneTriple("@version \"1.2\" . <x:s> <x:p> 123 . ") ; + parseOneTriple("@version \"1.2\" . <x:s> <x:p> 123 . "); } @Test (expected=ExFatal.class) public void turtle_rdf12_bad_11() { - Triple t = parseOneTriple("VERSION '1.2' . <x:s> <x:p> 123 . ") ; + parseOneTriple("VERSION '1.2' . <x:s> <x:p> 123 . "); } @Test (expected=ExFatal.class) public void turtle_rdf12_bad_12() { - Triple t = parseOneTriple("VERSION '''1.2''' <x:s> <x:p> 123 . ") ; + parseOneTriple("VERSION '''1.2''' <x:s> <x:p> 123 . "); } @Test (expected=ExFatal.class) public void turtle_rdf12_bad_13() { - Triple t = parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . ") ; + parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . "); + } + + // U+D800-U+DBFF is a high surrogate (first part of a pair) + // U+DC00-U+DFFF is a low surrogate (second part of a pair) + // so D800-DC00 is legal. + + @Test public void turtle_surrogate_1() { + // escaped high, escaped low + parseOneTriple("<x:s> <x:p> '\\ud800\\udc00' . "); + } + + @Test public void turtle_surrogate_2() { + // escaped high, raw low + parseOneTriple("<x:s> <x:p> '\\ud800\udc00' . "); + } + + // Compilation failure. (maven+openjdk - OK in Eclipse, and test correct) +// @Test public void turtle_surrogate_3() { +// // raw high, escaped low +// parseOneTriple("<x:s> <x:p> '\ud800\\udc00' . "); +// } + + @Test public void turtle_surrogate_4() { + // raw high, raw low + parseOneTriple("<x:s> <x:p> '\ud800\udc00' . "); + } + + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_1() { + parseOneTriple("<x:s> <x:p> '\\ud800' . "); + } + + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_2() { + parseOneTriple("<x:s> <x:p> '\\udfff' . "); + } + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_3() { + parseOneTriple("<x:s> <x:p> '\\U0000d800' . "); + } + + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_4() { + parseOneTriple("<x:s> <x:p> '\\U0000dfff' . "); + } + + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_5() { + // Wrong way round: low-high + parseOneTriple("<x:s> <x:p> '\\uc800\\ud800' . "); + } + + // Compilation failure. Can't write \ud800 +// @Test (expected=ExFatal.class) +// public void turtle_bad_surrogate_6() { +// // raw low - escaped high +// parseOneTriple("<x:s> <x:p> '\ud800\\ud800' . "); +// } + + @Test (expected=ExFatal.class) + public void turtle_bad_surrogate_7() { + // escaped low - raw high + parseOneTriple("<x:s> <x:p> '\\uc800\ud800' . "); } // No Formulae. Not trig. @Test (expected=ExFatal.class) - public void turtle_50() { parse("@prefix ex: <http://example/> . { ex:s ex:p 123 . } ") ; } + public void turtle_50() { parse("@prefix ex: <http://example/> . { ex:s ex:p 123 . } "); } @Test (expected=ExWarning.class) - public void turtle_60() { parse("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; } + public void turtle_60() { parse("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }"); } } diff --git a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java index b4c2865afb..bc2226a97a 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java @@ -1352,6 +1352,135 @@ public class TestTokenizerText { assertFalse(tokenizer.hasNext()); } + // U+D800-U+DBFF is a high surrogate (first part of a pair) + // U+DC00-U+DFFF is a low surrogate (second part of a pair) + // so D800-DC00 is legal. + + @Test public void turtle_surrogate_pair_01() { + // escaped high, escaped low + surrogate("'\\ud800\\udc00'"); + } + + @Test public void turtle_surrogate_pair_02() { + // escaped high, raw low + surrogate("'\\ud800\udc00'"); + } + + // Compilation failure - illegal escape character +// @Test public void turtle_surrogate_pair_03() { +// // raw high, escaped low +// surrogate("'\ud800\\udc00'"); +// } + + @Test public void turtle_surrogate_pair_04() { + // raw high, raw low + surrogate("'\ud800\udc00'"); + } + + @Test public void turtle_surrogate_pair_05() { + // escaped high, escaped low + surrogate("'a\\ud800\\udc00x'"); + } + + @Test public void turtle_surrogate_pair_06() { + // escaped high, raw low + surrogate("'z\\ud800\udc00'z"); + } + + // Compilation failure - illegal escape character +// @Test public void turtle_surrogate_pair_07() { +// // raw high, escaped low +// surrogate("'a\ud800\\udc00'z"); +// } + + @Test public void turtle_surrogate_pair_08() { + // raw high, raw low + surrogate("'a\ud800\udc00'z"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_01() { + surrogate("'\\ud800'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_02() { + surrogate("'a\\ud800z'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_03() { + surrogate("'\\udfff'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_04() { + surrogate("'a\\udfffz'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_05() { + surrogate("'\\U0000d800'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_06() { + surrogate("'a\\U0000d800z'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_07() { + surrogate("'\\U0000dfff'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_08() { + surrogate("'a\\U0000dfffz'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_09() { + // Wrong way round: low-high + surrogate("'\\uc800\\ud800'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_10() { + // Wrong way round: low-high + surrogate("'a\\uc800\\ud800z'"); + } + + // Compilation failure - illegal escape character +// @Test (expected=RiotParseException.class) +// public void turtle_bad_surrogate_11() { +// // raw low - escaped high +// surrogate("'\ud800\\ud800'"); +// } +// +// @Test (expected=RiotParseException.class) +// public void turtle_bad_surrogate_12() { +// // raw low - escaped high +// surrogate("'a\ud800\\ud800z'"); +// } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_13() { + // escaped low - raw high + surrogate("'\\uc800\ud800'"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_14() { + // escaped low - raw high + surrogate("'a\\uc800\ud800z'"); + } + + private void surrogate(String string) { + Tokenizer tokenizer = tokenizer(string); + tokenizer.hasNext(); + tokenizer.next(); + } + @Test public void token_rdf_star_reified_1() { Tokenizer tokenizer = tokenizer("<<"); diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java b/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java index a5078e0d57..5f2e6babd2 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/io/CharStream.java @@ -21,7 +21,6 @@ package org.apache.jena.atlas.io; /** * A simplified reader interface without IOExceptions. - * It's an interface, not an abstract class */ public interface CharStream { diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java b/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java index ae0efecaa5..1d24437d26 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/io/PeekReader.java @@ -34,11 +34,8 @@ import org.apache.jena.atlas.lib.Chars; public final class PeekReader extends Reader { // Remember to apply fixes to PeekInputStream as well. - // Buffering is done by a CharStream - does it make difference? - // Yes. A lot (Java6). - - // Using a Reader here seems to have zero cost or benefit but CharStream - // allows fast String handling. + // Buffering is done by a CharStream - it makes a difference + // CharStream faster than a Reader. private final CharStream source; private static final int PUSHBACK_SIZE = 10; diff --git a/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java b/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java index 872b90fe0c..e62eed6b12 100644 --- a/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java +++ b/jena-tdb1/src/test/java/org/apache/jena/tdb1/store/nodetable/TestNodec.java @@ -80,12 +80,8 @@ public class TestNodec @Test public void nodec_lit_22() { test ("''^^<>"); } // Bad Unicode. - static private final String binaryStr1 = "abc\uD800xyz"; // A single surrogate, without it's pair. - static private final String binaryStr2 = "\uD800"; // A single surrogate, without it's pair. - static private final String binaryStr3 = "\u0000"; // A zero character + static private final String binaryStr3 = "\u0000"; // A zero character - @Test public void nodec_lit_30() { test ("'"+binaryStr1+"'"); } - @Test public void nodec_lit_31() { test ("'"+binaryStr2+"'"); } @Test public void nodec_lit_32() { test ("'"+binaryStr3+"'"); } @Test public void nodec_lit_33() { test("'\uFFFD'"); }
