This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new dda5bc4680 DRILL-8493: Drill Unable to Read XML Files with Namespaces
(#2908)
dda5bc4680 is described below
commit dda5bc46808bcc361bd471584d1ee9f04bdc2db6
Author: Charles S. Givre <[email protected]>
AuthorDate: Sat Apr 27 21:55:29 2024 -0400
DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908)
---
.../org/apache/drill/exec/store/xml/XMLReader.java | 13 +++++--
.../apache/drill/exec/store/xml/TestXMLReader.java | 24 ++++++++++++
.../format-xml/src/test/resources/xml/sitemap.xml | 45 ++++++++++++++++++++++
3 files changed, 78 insertions(+), 4 deletions(-)
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 1d85851946..d985cd69d8 100644
---
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -82,6 +82,7 @@ public class XMLReader implements Closeable {
private XMLEventReader reader;
private ImplicitColumns metadata;
private boolean isSelfClosingEvent;
+ private Iterator<Attribute> rootAttributeIterator;
/**
* This field indicates the various states in which the reader operates. The
names should be self-explanatory,
@@ -103,6 +104,11 @@ public class XMLReader implements Closeable {
// This property prevents XXE attacks by disallowing DTD.
inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+
+ // When reading some documents with XML Namespaces, Drill seems to ignore the rest of the
+ // document. Setting this parameter to false solves this issue. However, when we introduce
+ // XSD support, it will likely be necessary to make this a configurable parameter.
+ inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
reader = inputFactory.createXMLEventReader(fsStream);
fieldNameStack = new Stack<>();
rowWriterStack = new Stack<>();
@@ -340,7 +346,6 @@ public class XMLReader implements Closeable {
// Get the field value
fieldValue = currentEvent.asCharacters().getData().trim();
changeState(xmlState.GETTING_DATA);
- changeState(xmlState.GETTING_DATA);
break;
case XMLStreamConstants.END_ELEMENT:
@@ -367,11 +372,11 @@ public class XMLReader implements Closeable {
} else if (currentState == xmlState.FIELD_ENDED && currentNestingLevel >= dataLevel) {
// Case to end nested maps
// Pop tupleWriter off stack
- if (rowWriterStack.size() > 0) {
+ if (!rowWriterStack.isEmpty()) {
currentTupleWriter = rowWriterStack.pop();
}
// Pop field name
- if (fieldNameStack.size() > 0) {
+ if (!fieldNameStack.isEmpty()) {
fieldNameStack.pop();
}
@@ -385,7 +390,7 @@ public class XMLReader implements Closeable {
attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);
// Pop field name
- if (fieldNameStack.size() > 0) {
+ if (!fieldNameStack.isEmpty()) {
fieldNameStack.pop();
}
fieldName = null;
diff --git
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index 1283337410..f6c924f0a7 100644
---
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -117,6 +117,30 @@ public class TestXMLReader extends ClusterTest {
new RowSetComparison(expected).verifyAndClearAll(results);
}
+ @Test
+ public void testAttributesOnRootWithNamespace() throws Exception {
+ String sql = "SELECT * FROM table(cp.`xml/sitemap.xml` (type => 'xml', dataLevel => 2))";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+ .addNullable("loc", MinorType.VARCHAR)
+ .addNullable("lastmod", MinorType.VARCHAR)
+ .addNullable("changefreq", MinorType.VARCHAR)
+ .addNullable("priority", MinorType.VARCHAR)
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml", "2024-03-28T00:10:00.074Z", "monthly", "1.0")
+ .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml", "2023-06-20T23:44:00.215Z", "monthly", "1.0")
+ .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml", "2023-07-03T14:32:01.529Z", "monthly", "1.0")
+ .build();
+
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+
@Test
public void testXXE() throws Exception {
String sql = "SELECT * FROM cp.`xml/bad.xml`";
diff --git a/contrib/format-xml/src/test/resources/xml/sitemap.xml
b/contrib/format-xml/src/test/resources/xml/sitemap.xml
new file mode 100644
index 0000000000..1225e6bb5d
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/sitemap.xml
@@ -0,0 +1,45 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+ <url>
+ <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml
+</loc>
+ <lastmod>2024-03-28T00:10:00.074Z</lastmod>
+ <changefreq>monthly</changefreq>
+ <priority>1.0</priority>
+ </url>
+ <url>
+ <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml
+</loc>
+ <lastmod>2023-06-20T23:44:00.215Z</lastmod>
+ <changefreq>monthly</changefreq>
+ <priority>1.0</priority>
+ </url>
+ <url>
+ <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml
+</loc>
+ <lastmod>2023-07-03T14:32:01.529Z</lastmod>
+ <changefreq>monthly</changefreq>
+ <priority>1.0</priority>
+ </url>
+</urlset>