[
https://issues.apache.org/jira/browse/DRILL-8453?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17757382#comment-17757382
]
ASF GitHub Bot commented on DRILL-8453:
---------------------------------------
jnturton commented on code in PR #2824:
URL: https://github.com/apache/drill/pull/2824#discussion_r1301478103
##########
contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/xsd/DrillXSDSchemaUtils.java:
##########
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml.xsd;
+
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.ws.commons.schema.XmlSchema;
+import org.apache.ws.commons.schema.XmlSchemaCollection;
+import org.apache.ws.commons.schema.XmlSchemaElement;
+
+import org.apache.ws.commons.schema.XmlSchemaObject;
+import org.apache.ws.commons.schema.walker.XmlSchemaWalker;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.xml.transform.stream.StreamSource;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+public class DrillXSDSchemaUtils {
+ private static final MinorType DEFAULT_TYPE = MinorType.VARCHAR;
+ private static final Logger logger =
LoggerFactory.getLogger(DrillXSDSchemaUtils.class);
+
+ /**
+ * This map maps the data types defined by the XSD definition to Drill data
types.
+ */
+ public static final ImmutableMap<String, MinorType> XML_TYPE_MAPPINGS =
ImmutableMap.<String, MinorType>builder()
+ .put("BASE64BINARY", MinorType.VARBINARY)
+ .put("BOOLEAN", MinorType.BIT)
+ .put("DATE", MinorType.DATE)
+ .put("DATETIME", MinorType.TIMESTAMP)
+ .put("DECIMAL", MinorType.VARDECIMAL)
+ .put("DOUBLE", MinorType.FLOAT8)
+ .put("DURATION", MinorType.INTERVAL)
+ .put("FLOAT", MinorType.FLOAT4)
+ .put("HEXBINARY", MinorType.VARBINARY)
+ .put("STRING", MinorType.VARCHAR)
+ .put("TIME", MinorType.TIME)
+ .build();
+
+ /**
+ * This function is only used for testing, but accepts a XSD file as input
rather than a {@link InputStream}
+ * @param filename A {@link String} containing an XSD file.
+ * @return A {@link TupleMetadata} containing a Drill representation of the
XSD schema.
+ * @throws IOException If anything goes wrong or the file is not found.
+ */
+ public static TupleMetadata getSchema(String filename) throws IOException {
Review Comment:
```suggestion
@VisibleForTesting
public static TupleMetadata getSchema(String filename) throws IOException {
```
##########
contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/xsd/TestXSDSchema.java:
##########
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml.xsd;
+
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.common.util.DrillFileUtils;
+import org.apache.drill.exec.record.metadata.MapBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.junit.Test;
+
+import java.io.File;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestXSDSchema {
+
+ @Test
+ public void testSimpleXSD() throws Exception {
+ File simple_xsd = DrillFileUtils.getResourceAsFile("/xsd/simple.xsd");
+ TupleMetadata schema = DrillXSDSchemaUtils.getSchema(simple_xsd.getPath());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addMap("shiporder")
+ .addMap("attributes")
+ .addNullable("orderid", MinorType.VARCHAR)
+ .resumeMap()
+ .addNullable("orderperson", MinorType.VARCHAR)
+ .addMap("shipto")
+ .addNullable("name", MinorType.VARCHAR)
+ .addNullable("address", MinorType.VARCHAR)
+ .addNullable("city", MinorType.VARCHAR)
+ .addNullable("country", MinorType.VARCHAR)
+ .resumeMap()
+ .addMapArray("item")
+ .addNullable("title", MinorType.VARCHAR)
+ .addNullable("note", MinorType.VARCHAR)
+ .addNullable("quantity", MinorType.VARDECIMAL)
+ .addNullable("price", MinorType.VARDECIMAL)
+ .resumeMap()
+ .resumeSchema()
+ .buildSchema();
+ assertTrue(expectedSchema.isEquivalent(schema));
+ }
+
+
+ @Test
+ public void testComplexXSD() throws Exception {
+ File complex_xsd = DrillFileUtils.getResourceAsFile("/xsd/complex.xsd");
+ TupleMetadata schema =
DrillXSDSchemaUtils.getSchema(complex_xsd.getPath());
+
+ SchemaBuilder sb1 = new SchemaBuilder();
+ MapBuilder sb2 = sb1
+ .addNullable("comment", MinorType.VARCHAR) // global comment element
+ .addMap("infoType")
+ .addMap("attributes")
+ .addNullable("kind", MinorType.VARCHAR)
+ .resumeMap()
+ .resumeSchema()
+ .addMap("purchaseOrder") // global purchaseOrder element
+ .addMap("attributes")
+ .addNullable("orderDate", MinorType.DATE) // an attribute
+ .addNullable("confirmDate", MinorType.DATE) // an attribute
+ .resumeMap()
+ .addMap("shipTo")
+ .addMap("attributes")
+ .addNullable("country", MinorType.VARCHAR) // an attribute
+ .resumeMap()
+ .addNullable("name", MinorType.VARCHAR)
+ .addNullable("street", MinorType.VARCHAR)
+ .addNullable("city", MinorType.VARCHAR)
+ .addNullable("state", MinorType.VARCHAR)
+ .addNullable("zip", MinorType.VARDECIMAL)
+ .resumeMap(); // end shipTo
+ MapBuilder sb3 = sb2
+ .addMap("billTo")
+ .addMap("attributes")
+ .addNullable("country", MinorType.VARCHAR) // an attribute
+ .resumeMap()
+ .addNullable("name", MinorType.VARCHAR)
+ .addNullable("street", MinorType.VARCHAR)
+ .addNullable("city", MinorType.VARCHAR)
+ .addNullable("state", MinorType.VARCHAR)
+ .addNullable("zip", MinorType.VARDECIMAL)
+ .resumeMap();
+ MapBuilder sb4 = sb3
+ .addNullable("comment", MinorType.VARCHAR)
+ .addMap("items")
+ .addMapArray("item")
+ .addMap("attributes")
+ .addNullable("partNum", MinorType.VARCHAR) // an attribute
+ .resumeMap()
+ .addNullable("productName", MinorType.VARCHAR)
+ .addNullable("quantity", MinorType.VARDECIMAL)
+ .addNullable("USPrice", MinorType.VARDECIMAL)
+ .addNullable("comment", MinorType.VARCHAR)
+ .addNullable("shipDate", MinorType.DATE)
+ .resumeMap() // end item
+ .resumeMap(); // end items
+
+ TupleMetadata expectedSchema = sb4.resumeSchema().build();
+
+ System.out.println(schema);
+ System.out.println(expectedSchema);
+
Review Comment:
Are these `System.out.println` debugging statements still meant to be present?
##########
contrib/format-xml/src/test/resources/logback-test.xml:
##########
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<configuration>
+ <if condition='property("drill.lilith.enable").equalsIgnoreCase("true")'>
+ <then>
+ <appender name="SOCKET"
class="de.huxhorn.lilith.logback.appender.ClassicMultiplexSocketAppender">
+ <Compressing>true</Compressing>
+ <ReconnectionDelay>10000</ReconnectionDelay>
+ <IncludeCallerData>true</IncludeCallerData>
+ <RemoteHosts>${LILITH_HOSTNAME:-localhost}</RemoteHosts>
+ </appender>
Review Comment:
I haven't seen this before - can you explain what it's for?
> Add XSD Support to XML Reader (Part 1)
> --------------------------------------
>
> Key: DRILL-8453
> URL: https://issues.apache.org/jira/browse/DRILL-8453
> Project: Apache Drill
> Issue Type: Improvement
> Components: Format - XML
> Affects Versions: 1.21.1
> Reporter: Charles Givre
> Assignee: Charles Givre
> Priority: Major
> Fix For: 1.21.2
>
>
> This PR is part of a series to add better support for reading XML data to
> Drill. One of the main challenges is that XML data alone provides no way of
> inferring data types, nor a way of detecting arrays.
> The only way to do this really well is to have a schema. Some XML files link
> a schema definition file to the data. This PR adds the capability for Drill
> to map XSD schema files into Drill schemas.
> The current plan is as follows: Part 1 (this PR) simply adds the reader but
> adds no new user-detectable functionality. Part 2 will include the actual
> integration with the XML reader. Part 3 will include the ability to read
> arrays.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)