Is there a reason that Jena does not support the full range of XML name
start characters?
see https://www.w3.org/TR/xml/#NT-NameStartChar
I wrote a quick test and found that there were a number of characters that
Jena does not support.
Miscategorization appears to start at 0x132. There are 936990
miscategorized characters.
The issue is actually in the Xerces util class XMLChar.
Is this because of the version of Xerces we are stuck with? Is there a way
around this issue?
Claude
p.s. Since I can't attach a file, here is the test code I wrote.
import static org.junit.Assert.assertTrue;
import org.apache.xerces.util.XMLChar;
import org.junit.Test;
public class NameTest {
/*
* NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] |
[#xD8-#xF6] |
* [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
* [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] |
[#xF900-#xFDCF] |
* [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
*/
int[][] ranges = { { ':', ':' }, { 'A', 'Z' }, { '_', '_' }, { 0xC0,
0xD6 }, { 0xD8, 0xF6 }, { 0xF8, 0x2FF },
{ 0x370, 0x37D }, { 0x37F, 0x1FFF }, { 0x200C, 0x200D }, {
0x2070, 0x218F }, { 0x2C00, 0x2FEF },
{ 0x3001, 0xD7FF }, { 0xF900, 0xFDCF }, { 0xFDF0, 0xFFFD }, {
0x10000, 0xEFFFF } };
@Test
public void testNameStart() {
for (int[] range : ranges) {
for (int c = range[0]; c <= range[1]; c++) {
assertTrue( String.format( "character %s
0x%s",c,Integer.toHexString( c )) , XMLChar.isNameStart( c ) );
}
}
}
@Test
public void listNameStartErr() {
int cnt = 0;
for (int[] range : ranges) {
for (int c = range[0]; c <= range[1]; c++) {
if (!XMLChar.isNameStart( c ))
{
System.out.print( String.format( "0x%s
",Integer.toHexString( c )) );
cnt++;
if (cnt % 25 == 0)
{
System.out.println();
}
}
}
}
System.out.println();
System.out.println( cnt+" characters miscategorized" );
}
}