Repository: jena
Updated Branches:
  refs/heads/master a5d3d915f -> 0cac294ec


JENA-911 : Strict RDF 1.1 URI tokens (no space, {, } ... in URI)


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/0cac294e
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/0cac294e
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/0cac294e

Branch: refs/heads/master
Commit: 0cac294ece3aba6ff73e9d960499bf968bdf9ab4
Parents: a5d3d91
Author: Andy Seaborne <[email protected]>
Authored: Mon Apr 27 19:36:28 2015 +0100
Committer: Andy Seaborne <[email protected]>
Committed: Mon Apr 27 19:36:28 2015 +0100

----------------------------------------------------------------------
 .../java/org/apache/jena/atlas/lib/Chars.java   |  2 +
 .../org/apache/jena/riot/tokens/Tokenizer.java  |  1 -
 .../jena/riot/tokens/TokenizerFactory.java      | 40 +++++++-------
 .../apache/jena/riot/tokens/TokenizerText.java  | 56 ++++++++++----------
 .../apache/jena/riot/lang/TestLangNTuples.java  | 16 +++---
 .../org/apache/jena/riot/lang/TestLangTrig.java | 10 ++--
 .../apache/jena/riot/lang/TestLangTurtle.java   | 33 +++++-------
 .../apache/jena/riot/tokens/TestTokenizer.java  | 47 ++++++++++++++++
 8 files changed, 125 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java 
b/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
index ba301f8..38f4f88 100644
--- a/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
+++ b/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
@@ -183,6 +183,8 @@ public class Chars
     public static final int  UNSET           =  -2 ;
     public static final char NL              = '\n' ;
     public static final char CR              = '\r' ;
+    public static final char TAB             = '\t' ;
+    public static final char SPC             = ' ' ;
     public static final char BSPACE          = '\b' ;
     
     public static final char CH_ZERO         =  (char)0 ;

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
index b5ffa00..b1cd5e6 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
@@ -22,7 +22,6 @@ import java.util.Iterator ;
 
 import org.apache.jena.atlas.lib.Closeable ;
 
-
 public interface Tokenizer extends Iterator<Token>, Closeable
 {
     /** Is there another token? */

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
index 3571c84..886bdbf 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
@@ -16,49 +16,53 @@
  * limitations under the License.
  */
 
-package org.apache.jena.riot.tokens;
+package org.apache.jena.riot.tokens ;
 
 import java.io.ByteArrayInputStream ;
 import java.io.InputStream ;
 import java.io.Reader ;
+import java.io.StringReader ;
 
 import org.apache.jena.atlas.io.PeekReader ;
 import org.apache.jena.atlas.lib.StrUtils ;
 
-public class TokenizerFactory
-{
-    /** Discouraged - be careful about character sets */ 
-    public static Tokenizer makeTokenizer(Reader reader)
-    {
+public class TokenizerFactory {
+    
+    /** Discouraged - be careful about character sets */
+    @Deprecated
+    public static Tokenizer makeTokenizer(Reader reader) {
         PeekReader peekReader = PeekReader.make(reader) ;
         Tokenizer tokenizer = new TokenizerText(peekReader) ;
         return tokenizer ;
     }
-    
-    public static Tokenizer makeTokenizerUTF8(InputStream in)
-    {
+
+    /** Discouraged - be careful about character sets */
+    public static Tokenizer makeTokenizer(StringReader reader) {
+        PeekReader peekReader = PeekReader.make(reader) ;
+        Tokenizer tokenizer = new TokenizerText(peekReader) ;
+        return tokenizer ;
+    }
+
+    public static Tokenizer makeTokenizerUTF8(InputStream in) {
         // BOM will be removed
         PeekReader peekReader = PeekReader.makeUTF8(in) ;
         Tokenizer tokenizer = new TokenizerText(peekReader) ;
         return tokenizer ;
     }
-   
-    public static Tokenizer makeTokenizerASCII(InputStream in)
-    {
+
+    public static Tokenizer makeTokenizerASCII(InputStream in) {
         PeekReader peekReader = PeekReader.makeASCII(in) ;
         Tokenizer tokenizer = new TokenizerText(peekReader) ;
         return tokenizer ;
     }
-    
-    public static Tokenizer makeTokenizerASCII(String string)
-    {
+
+    public static Tokenizer makeTokenizerASCII(String string) {
         byte b[] = StrUtils.asUTF8bytes(string) ;
         ByteArrayInputStream in = new ByteArrayInputStream(b) ;
         return makeTokenizerASCII(in) ;
     }
-    
-    public static Tokenizer makeTokenizerString(String str)
-    {
+
+    public static Tokenizer makeTokenizerString(String str) {
         PeekReader peekReader = PeekReader.readString(str) ;
         Tokenizer tokenizer = new TokenizerText(peekReader) ;
         return tokenizer ;

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index 5b03485..ba420bb 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -37,7 +37,7 @@ public final class TokenizerText implements Tokenizer
     // TODO Remove CNTL and make SYMBOLS
     // Drop through to final general symbol/keyword reader, including <=, != 
     // Care with <=
-    // STRING, not STIRNG1/2, LONG_STRING1,2
+    // STRING, not STRING1/2, LONG_STRING1/2
     // Policy driven for CURIES?
     
     // Various allow/deny options (via checker?)
@@ -432,36 +432,36 @@ public final class TokenizerText implements Tokenizer
         stringBuilder.setLength(0) ;
         for (;;) {
             int ch = reader.readChar() ;
-            if ( ch == EOF )
-                exception("Broken IRI (End of file): %s", 
stringBuilder.toString()) ;
-            if ( ch == '\n' )
-                exception("Broken IRI (newline): %s", 
stringBuilder.toString()) ;
-            if ( ch == '\r' )
-                exception("Broken IRI (CR): %s", stringBuilder.toString()) ;
-            if ( ch == CH_GT )
-                return stringBuilder.toString() ;
-            if ( ch == '\\' ) {
-                if ( VeryVeryLax )
-                    ch = readCharEscapeAnyURI() ;
-                else
-                    // NORMAL
-                    ch = readUnicodeEscape() ;
-                // Drop through.
+            switch(ch) {
+                case EOF:
+                    exception("Broken IRI (End of file): %s", 
stringBuilder.toString()) ;
+                case NL:
+                    exception("Broken IRI (newline): %s", 
stringBuilder.toString()) ;
+                case CR:
+                    exception("Broken IRI (CR): %s", stringBuilder.toString()) 
;
+                case CH_GT:
+                    // Done!
+                    return stringBuilder.toString() ;
+                case CH_RSLASH:
+                    if ( VeryVeryLax )
+                        ch = readCharEscapeAnyURI() ;
+                    else
+                        // NORMAL
+                        ch = readUnicodeEscape() ;
+                    break ;
             }
 
             if ( !VeryVeryLax ) {
-                // JENA-911
-//                if ( ch == 0x09 )
-//                    exception("Broken IRI (Tab character): %s", 
stringBuilder.toString()) ;
-//                if ( ch <= 0x19 )
-//                    exception("Broken IRI (control char 0x%02X): %s", ch, 
stringBuilder.toString()) ;
-//                if ( ch == 0x20 )
-//                    exception("Broken IRI (space): %s...", 
stringBuilder.toString()) ;
-//                if ( ch == '"' || ch == '{' || ch == '}' || ch == '|' || ch 
== '^' || ch == '`')
-//                    exception("Broken IRI (Illegal character 0x%02X, '%c'): 
%s", ch, (char)ch, stringBuilder.toString()) ;
-                // Ban certain very bad characters
-                if ( ch == '<' )
-                    exception("Broken IRI (bad character: '%c'): %s", ch, 
stringBuilder.toString()) ;
+                if ( ch == CH_LT )  // '<' -- very bad
+                    exception("Broken IRI (bad character: '%c'): %s", 
(char)ch, stringBuilder.toString()) ;
+                if ( ch == TAB )
+                    exception("Broken IRI (Tab character): %s", 
stringBuilder.toString()) ;
+                if ( ch <= 0x19 )
+                    exception("Broken IRI (control char 0x%02X): %s", ch, 
stringBuilder.toString()) ;
+                if ( ch == SPC )
+                    exception("Broken IRI (space): %s...", 
stringBuilder.toString()) ;
+                if ( ch == '"' || ch == '{' || ch == '}' || ch == '|' || ch == 
'^' || ch == '`')
+                    exception("Broken IRI (Illegal character 0x%02X, '%c'): 
%s", ch, (char)ch, stringBuilder.toString()) ;
             }
             insertCodepoint(stringBuilder, ch) ;
         }

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java 
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
index d1de6e3..a401195 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
@@ -123,22 +123,22 @@ abstract public class TestLangNTuples extends BaseTest
     }
     
     // Bad terms - but accepted by default.
-    @Test 
+    @Test(expected=ExFatal.class)
     public void tuple_bad_10()       { parseCount("<x> <p> <bad uri> .") ; } 
 
-    // Bad terms - but accepted by default.
+    // Bad terms (value range) - but legal syntax 
     @Test 
     public void tuple_bad_11()       { parseCount("<x> <p> 
\"9000\"^^<http://www.w3.org/2001/XMLSchema#byte> .") ; } 
 
-    // Bad terms - but accepted by default.
-    @Test (expected=ExError.class)
+    // Bad - relative URI.
+    @Test(expected=ExError.class)
     public void tuple_bad_21()       { parseCheck("<x> <p> <z> .") ; } 
 
-    // Bad terms - with checking.
-    @Test (expected=ExWarning.class)
-    public void tuple_bad_22()       { parseCheck("<http://example/x> 
<http://example/p> <http://example/bad uri> .") ; } 
+    // Bad terms
+    @Test(expected=ExFatal.class)
+    public void tuple_bad_22()       { parseCheck("<http://example/x> 
<http://example/p> \"abc\"^^<http://example/bad uri> .") ; } 
 
-    @Test  (expected=ExWarning.class)
+    @Test(expected=ExWarning.class)
     public void tuple_bad_23()       { parseCheck("<http://example/x> 
<http://example/p> \"9000\"^^<http://www.w3.org/2001/XMLSchema#byte> .") ; } 
     
     // ASCII vs UTF-8

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java 
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
index 5b47f42..bb04e29 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
@@ -21,10 +21,10 @@ package org.apache.jena.riot.lang;
 import org.apache.jena.atlas.junit.BaseTest ;
 import org.apache.jena.atlas.lib.StrUtils ;
 import org.apache.jena.graph.Triple ;
-import org.apache.jena.riot.ErrorHandlerTestLib ;
+import org.apache.jena.riot.* ;
 import org.apache.jena.riot.ErrorHandlerTestLib.ErrorHandlerEx ;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ;
 import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ;
-import org.apache.jena.riot.RiotReader ;
 import org.apache.jena.riot.system.StreamRDF ;
 import org.apache.jena.riot.system.StreamRDFLib ;
 import org.apache.jena.riot.tokens.Tokenizer ;
@@ -72,13 +72,13 @@ public class TestLangTrig extends BaseTest
     // Also need to check that the RiotExpection is called in normal use. 
     
     // Bad terms.
-    @Test (expected=ExWarning.class)
+    @Test (expected=ExFatal.class)
     public void trig_20()     { parse("@prefix ex:  <bad iri> .", "{ ex:s ex:p 
123 }") ; }
     
-    @Test (expected=ExWarning.class)
+    @Test (expected=ExFatal.class)
     public void trig_21()     { parse("@prefix ex:  <http://example/> .", "{ 
ex:s <http://example/broken p> 123 }") ; }
     
-    @Test (expected=ExWarning.class)
+    @Test (expected=ExFatal.class)
     public void trig_22()     { parse("{ <x> <p> 'number'^^<bad uri> }") ; }
 
     @Test (expected=ExWarning.class)

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java 
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
index 2c59a58..be1bdc0 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
@@ -22,7 +22,6 @@ import static 
org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogg
 import static 
org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler ;
 import static 
org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler ;
 
-import java.io.Reader ;
 import java.io.StringReader ;
 
 import org.apache.jena.atlas.junit.BaseTest ;
@@ -135,7 +134,7 @@ public class TestLangTurtle extends BaseTest
     private static Graph parse(String ...strings)
     {
         String string = StrUtils.strjoin("\n", strings) ;
-        Reader reader = new StringReader(string) ;
+        StringReader reader = new StringReader(string) ;
         String baseIRI = "http://base/" ;
         Tokenizer tokenizer = TokenizerFactory.makeTokenizer(reader) ;
         
@@ -182,7 +181,7 @@ public class TestLangTurtle extends BaseTest
     @Test(expected=ExFatal.class)
     public void errorBadDatatype()          { parse("<p> <p> 'q'^^.") ; }
     
-    @Test(expected=ExWarning.class)
+    @Test(expected=RiotException.class)
     public void errorBadURI_1()
     { parse("<http://example/a b> <http://example/p> 123 .") ; }
 
@@ -195,25 +194,29 @@ public class TestLangTurtle extends BaseTest
     public void errorBadURI_3()
     { parse("<http://example/a%Aab> <http://example/p> 123 .") ; }
 
+    // Bad URIs
+    @Test (expected=ExFatal.class)
+    public void errorBadURI_4()     { parse("@prefix ex:  <bad iri> .  ex:s 
ex:p 123 ") ; }
+    
+    @Test (expected=ExFatal.class)
+    public void errorBadURI_5()     { parse("<x> <p> 'number'^^<bad uri> ") ; }
+    
     @Test
-    public void turtle_01()         
-    { 
+    public void turtle_01() {
         Triple t = parseOneTriple("<s> <p> 123 . ") ;
         Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)") ;
         assertEquals(t2, t) ;
     }
 
     @Test
-    public void turtle_02()         
-    { 
+    public void turtle_02() {
         Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . ") ;
         Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p> 
123)") ;
         assertEquals(t2, t) ;
     }
 
     @Test
-    public void turtle_03()         
-    { 
+    public void turtle_03() {
         Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p 
123 . ") ;
         Triple t2 = SSE.parseTriple("(<http://example/x/s> 
<http://example/x/p> 123)") ;
         assertEquals(t2, t) ;
@@ -223,16 +226,6 @@ public class TestLangTurtle extends BaseTest
     @Test (expected=ExFatal.class)
     public void turtle_10()     { parse("@prefix ex:  <http://example/> .  { 
ex:s ex:p 123 . } ") ; }
     
-    // Bad terms.
-    @Test (expected=ExWarning.class)
-    public void turtle_20()     { parse("@prefix ex:  <bad iri> .  ex:s ex:p 
123 ") ; }
-    
-    @Test (expected=ExWarning.class)
-    public void turtle_21()     { parse("@prefix ex:  <http://example/> . ex:s 
<http://example/broken p> 123") ; }
-    
-    @Test (expected=ExWarning.class)
-    public void turtle_22()     { parse("<x> <p> 'number'^^<bad uri> ") ; }
-
     @Test (expected=ExWarning.class)
-    public void turtle_23()     { parse("@prefix xsd:  
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; }
+    public void turtle_20()     { parse("@prefix xsd:  
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; }
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java 
b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
index 89b9cb8..a85c861 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
@@ -23,6 +23,7 @@ import java.io.ByteArrayInputStream ;
 import org.apache.jena.atlas.io.PeekReader ;
 import org.apache.jena.atlas.junit.BaseTest ;
 import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.riot.RiotException ;
 import org.apache.jena.riot.RiotParseException ;
 import org.apache.jena.sparql.ARQConstants ;
 import org.junit.Test ;
@@ -152,6 +153,52 @@ public class TestTokenizer extends BaseTest {
         tokenizeAndTestFirst("<abc\\u0041def>   123", TokenType.IRI, 
"abcAdef") ;
     }
 
+    // Bad IRIs
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri10() {
+        tokenFirst("<abc def>") ;
+    }
+
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri11() {
+        tokenFirst("<abc<def>") ;
+    }
+
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri12() {
+        tokenFirst("<abc{def>") ;
+    }
+    
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri13() {
+        tokenFirst("<abc}def>") ;
+    }
+    
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri14() {
+        tokenFirst("<abc|def>") ;
+    }
+    
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri15() {
+        tokenFirst("<abc^def>") ;
+    }
+
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri16() {
+        tokenFirst("<abc`def>") ;
+    }
+    
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri17() {
+        tokenFirst("<abc\tdef>") ;  // Java escape - real tab
+    }
+
+    @Test(expected=RiotException.class)
+    public void tokenUnit_iri18() {
+        tokenFirst("<abc\u0007def>") ;  // Java escape - codepoint 7 
+    }
+
     @Test
     public void tokenUnit_str1() {
         tokenizeAndTestExact("   'abc'   ", TokenType.STRING1, "abc") ;

Reply via email to