[ https://issues.apache.org/jira/browse/DRILL-8453?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17757382#comment-17757382 ]
ASF GitHub Bot commented on DRILL-8453: --------------------------------------- jnturton commented on code in PR #2824: URL: https://github.com/apache/drill/pull/2824#discussion_r1301478103 ########## contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/xsd/DrillXSDSchemaUtils.java: ########## @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.xml.xsd; + +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap; +import org.apache.ws.commons.schema.XmlSchema; +import org.apache.ws.commons.schema.XmlSchemaCollection; +import org.apache.ws.commons.schema.XmlSchemaElement; + +import org.apache.ws.commons.schema.XmlSchemaObject; +import org.apache.ws.commons.schema.walker.XmlSchemaWalker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.transform.stream.StreamSource; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class DrillXSDSchemaUtils { + private static final MinorType DEFAULT_TYPE = MinorType.VARCHAR; + private static final Logger logger = LoggerFactory.getLogger(DrillXSDSchemaUtils.class); + + /** + * This map maps the data types defined by the XSD definition to Drill data types. + */ + public static final ImmutableMap<String, MinorType> XML_TYPE_MAPPINGS = ImmutableMap.<String, MinorType>builder() + .put("BASE64BINARY", MinorType.VARBINARY) + .put("BOOLEAN", MinorType.BIT) + .put("DATE", MinorType.DATE) + .put("DATETIME", MinorType.TIMESTAMP) + .put("DECIMAL", MinorType.VARDECIMAL) + .put("DOUBLE", MinorType.FLOAT8) + .put("DURATION", MinorType.INTERVAL) + .put("FLOAT", MinorType.FLOAT4) + .put("HEXBINARY", MinorType.VARBINARY) + .put("STRING", MinorType.VARCHAR) + .put("TIME", MinorType.TIME) + .build(); + + /** + * This function is only used for testing, but accepts a XSD file as input rather than a {@link InputStream} + * @param filename A {@link String} containing an XSD file. + * @return A {@link TupleMetadata} containing a Drill representation of the XSD schema. + * @throws IOException If anything goes wrong or the file is not found. 
+ */ + public static TupleMetadata getSchema(String filename) throws IOException { Review Comment: ```suggestion @VisibleForTesting public static TupleMetadata getSchema(String filename) throws IOException { ``` ########## contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/xsd/TestXSDSchema.java: ########## @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.xml.xsd; + +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.common.util.DrillFileUtils; +import org.apache.drill.exec.record.metadata.MapBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.junit.Test; + +import java.io.File; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestXSDSchema { + + @Test + public void testSimpleXSD() throws Exception { + File simple_xsd = DrillFileUtils.getResourceAsFile("/xsd/simple.xsd"); + TupleMetadata schema = DrillXSDSchemaUtils.getSchema(simple_xsd.getPath()); + + TupleMetadata expectedSchema = new SchemaBuilder() + .addMap("shiporder") + .addMap("attributes") + .addNullable("orderid", MinorType.VARCHAR) + .resumeMap() + .addNullable("orderperson", MinorType.VARCHAR) + .addMap("shipto") + .addNullable("name", MinorType.VARCHAR) + .addNullable("address", MinorType.VARCHAR) + .addNullable("city", MinorType.VARCHAR) + .addNullable("country", MinorType.VARCHAR) + .resumeMap() + .addMapArray("item") + .addNullable("title", MinorType.VARCHAR) + .addNullable("note", MinorType.VARCHAR) + .addNullable("quantity", MinorType.VARDECIMAL) + .addNullable("price", MinorType.VARDECIMAL) + .resumeMap() + .resumeSchema() + .buildSchema(); + assertTrue(expectedSchema.isEquivalent(schema)); + } + + + @Test + public void testComplexXSD() throws Exception { + File complex_xsd = DrillFileUtils.getResourceAsFile("/xsd/complex.xsd"); + TupleMetadata schema = DrillXSDSchemaUtils.getSchema(complex_xsd.getPath()); + + SchemaBuilder sb1 = new SchemaBuilder(); + MapBuilder sb2 = sb1 + .addNullable("comment", MinorType.VARCHAR) // global comment element + .addMap("infoType") + .addMap("attributes") + .addNullable("kind", MinorType.VARCHAR) + .resumeMap() + .resumeSchema() + .addMap("purchaseOrder") // global purchaseOrder element + .addMap("attributes") + 
.addNullable("orderDate", MinorType.DATE) // an attribute + .addNullable("confirmDate", MinorType.DATE) // an attribute + .resumeMap() + .addMap("shipTo") + .addMap("attributes") + .addNullable("country", MinorType.VARCHAR) // an attribute + .resumeMap() + .addNullable("name", MinorType.VARCHAR) + .addNullable("street", MinorType.VARCHAR) + .addNullable("city", MinorType.VARCHAR) + .addNullable("state", MinorType.VARCHAR) + .addNullable("zip", MinorType.VARDECIMAL) + .resumeMap(); // end shipTo + MapBuilder sb3 = sb2 + .addMap("billTo") + .addMap("attributes") + .addNullable("country", MinorType.VARCHAR) // an attribute + .resumeMap() + .addNullable("name", MinorType.VARCHAR) + .addNullable("street", MinorType.VARCHAR) + .addNullable("city", MinorType.VARCHAR) + .addNullable("state", MinorType.VARCHAR) + .addNullable("zip", MinorType.VARDECIMAL) + .resumeMap(); + MapBuilder sb4 = sb3 + .addNullable("comment", MinorType.VARCHAR) + .addMap("items") + .addMapArray("item") + .addMap("attributes") + .addNullable("partNum", MinorType.VARCHAR) // an attribute + .resumeMap() + .addNullable("productName", MinorType.VARCHAR) + .addNullable("quantity", MinorType.VARDECIMAL) + .addNullable("USPrice", MinorType.VARDECIMAL) + .addNullable("comment", MinorType.VARCHAR) + .addNullable("shipDate", MinorType.DATE) + .resumeMap() // end item + .resumeMap(); // end items + + TupleMetadata expectedSchema = sb4.resumeSchema().build(); + + System.out.println(schema); + System.out.println(expectedSchema); + Review Comment: Are these still meant to be present? ########## contrib/format-xml/src/test/resources/logback-test.xml: ########## @@ -0,0 +1,69 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +--> +<configuration> + <if condition='property("drill.lilith.enable").equalsIgnoreCase("true")'> + <then> + <appender name="SOCKET" class="de.huxhorn.lilith.logback.appender.ClassicMultiplexSocketAppender"> + <Compressing>true</Compressing> + <ReconnectionDelay>10000</ReconnectionDelay> + <IncludeCallerData>true</IncludeCallerData> + <RemoteHosts>${LILITH_HOSTNAME:-localhost}</RemoteHosts> + </appender> Review Comment: I haven't seen this before - can you explain what it's for? > Add XSD Support to XML Reader (Part 1) > -------------------------------------- > > Key: DRILL-8453 > URL: https://issues.apache.org/jira/browse/DRILL-8453 > Project: Apache Drill > Issue Type: Improvement > Components: Format - XML > Affects Versions: 1.21.1 > Reporter: Charles Givre > Assignee: Charles Givre > Priority: Major > Fix For: 1.21.2 > > > This PR is a part of a series to add better support for reading XML data to > Drill. One of the main challenges is that XML data does not have a way of > inferring data types, nor does it have a way of detecting arrays. > The only way to do this really well is to have a schema. Some XML files link > a schema definition file to the data. This PR adds the capability for Drill > to map XSD schema files into Drill schemas. > The current plan is as follows: Part 1 of this PR simply adds the reader but > adds no new user-detectable functionality. 
Part 2 will include the actual > integration with the XML reader. Part 3 will include the ability to read > arrays. -- This message was sent by Atlassian Jira (v8.20.10#820010)