[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17863373#comment-17863373 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1666968774 ## contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java: ## @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil; + +import org.apache.drill.categories.RowSetTest; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.physical.rowSet.RowSet; +import org.apache.drill.exec.physical.rowSet.RowSetReader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.nio.file.Paths; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@Category(RowSetTest.class) +public class TestDaffodilReader extends ClusterTest { + + String schemaURIRoot = "file:///opt/drill/contrib/format-daffodil/src/test/resources/"; Review Comment: What, exactly, do I change this to, if I want to retrieve files from $DRILL_CONFIG_DIR/lib ? ## contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java: ## @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil; + +import org.apache.drill.categories.RowSetTest; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.physical.rowSet.RowSet; +import org.apache.drill.exec.physical.rowSet.RowSetReader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.nio.file.Paths; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@Category(RowSetTest.class) +public class TestDaffodilReader extends ClusterTest { + + String schemaURIRoot = "file:///opt/drill/contrib/format-daffodil/src/test/resources/"; + + @BeforeClass + public static void setup() throws Exception { +// boilerplate call to start test rig +ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + +DaffodilFormatConfig formatConfig = new DaffodilFormatConfig(null, "", "", "", false); + +cluster.defineFormat("dfs", "daffodil", formatConfig); + +// Needed to test against compressed files. +// Copies data from src/test/resources to the dfs root. +dirTestWatcher.copyResourceToRoot(Paths.get("data/")); +dirTestWatcher.copyResourceToRoot(Paths.get("schema/")); + } + + private String selectRow(String schema,
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17845113#comment-17845113 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1595903385 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java: ## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.japi.Compiler; +import org.apache.daffodil.japi.Daffodil; +import org.apache.daffodil.japi.DataProcessor; +import org.apache.daffodil.japi.Diagnostic; +import org.apache.daffodil.japi.InvalidParserException; +import org.apache.daffodil.japi.InvalidUsageException; +import org.apache.daffodil.japi.ProcessorFactory; +import org.apache.daffodil.japi.ValidationMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.channels.Channels; +import java.util.List; +import java.util.Objects; + +/** + * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL schema so that one can + * obtain a DataProcessor for use with DaffodilMessageParser. + * + * TODO: Needs to use a cache to avoid reloading/recompiling every time. + */ +public class DaffodilDataProcessorFactory { + // Default constructor is used. + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDataProcessorFactory.class); + + private DataProcessor dp; + + /** + * Gets a Daffodil DataProcessor given the necessary arguments to compile or reload it. + * + * @param schemaFileURI + * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd extension) + * @param validationMode + * Use true to request Daffodil built-in 'limited' validation. Use false for no validation. + * @param rootName + * Local name of root element of the message. Can be null to use the first element declaration + * of the primary schema file. Ignored if reloading a pre-compiled schema. + * @param rootNS + * Namespace URI as a string. Can be null to use the target namespace of the primary schema + * file or if it is unambiguous what element is the rootName. Ignored if reloading a + * pre-compiled schema. 
+ * @return the DataProcessor + * @throws CompileFailure + * - if schema compilation fails + */ + public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, String rootName, + String rootNS) + throws CompileFailure { + +DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory(); +boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin"); +if (isPrecompiled) { + if (Objects.nonNull(rootName) && !rootName.isEmpty()) { +// A usage error. You shouldn't supply the name and optionally namespace if loading +// precompiled schema because those are built into it. Should be null or "". +logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", +rootName); + } + try { +dmp.loadSchema(schemaFileURI); + } catch (IOException | InvalidParserException e) { +throw new CompileFailure(e); + } + dmp.setupDP(validationMode, null); +} else { + List pfDiags; + try { +pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS); + } catch (URISyntaxException | IOException e) { +throw new CompileFailure(e); + } + dmp.setupDP(validationMode, pfDiags); +} +return dmp.dp; + } + + private void loadSchema(URI schemaFileURI) throws IOException, InvalidParserException { +Compiler c = Daffodil.compiler(); +dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream())); + } + + private List compileSchema(URI schemaFileURI, String rootName, String rootNS) + throws URISyntaxException, IOException, CompileFailure { +Compiler c =
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17843832#comment-17843832 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2909: URL: https://github.com/apache/drill/pull/2909#issuecomment-209976 > Hi Mike, Are you free at all this week? My apologies... We're in the middle of putting an offer on a house and my life is very hectic at the moment. Best, > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17843601#comment-17843601 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2909: URL: https://github.com/apache/drill/pull/2909#issuecomment-2095044801 Hi Mike, Are you free at all this week? My apologies... We're in the middle of putting an offer on a house and my life is very hectic at the moment. Best, > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841807#comment-17841807 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2909: URL: https://github.com/apache/drill/pull/2909#issuecomment-2081781546 Tests are now failing due to these two things in TestDaffodilReader.java ``` String schemaURIRoot = "file:///opt/drill/contrib/format-daffodil/src/test/resources/"; ``` That's an absolute URI that is used to obtain access to the schema files in this statement: ``` private String selectRow(String schema, String file) { return "SELECT * FROM table(dfs.`data/" + file + "` " + " (type => 'daffodil'," + " " + "validationMode => 'true', " + " schemaURI => '" + schemaURIRoot + "schema/" + schema + ".dfdl.xsd'," + " rootName => 'row'," + " rootNamespace => null " + "))"; } ``` This is assembling a select statement, and puts this absolute schemaURI into the schemaURI part of the select. What should I be doing to arrange for these schema URIs to be found? The schemas are a large complex set of files, not just a single file. Many files (potentially hundreds) must be found relative to the initial root schema file, as they include/import other schema files using relative paths. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841775#comment-17841775 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1582375084 ## exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java: ## @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { +if (Objects.isNull(parent)) Review Comment: I just built Drill using the following command: ```sh mvn clean install -DskipTests ``` When I did that, I was getting the same error as on GitHub. After adding the braces as described above, it built without issues. With that said, I think you can just run the checkstyle with: ```sh mvn checkstyle:checkstyle ``` > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841774#comment-17841774 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1582375084 ## exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java: ## @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { +if (Objects.isNull(parent)) Review Comment: I just built Drill using the following command: ```sh mvn clean install -DskipTests ``` I think you can just run the checkstyle with: ```sh mvn checkstyle:checkstyle ``` > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841768#comment-17841768 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1582367382 ## exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java: ## @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { +if (Objects.isNull(parent)) Review Comment: What is the maven command line to just make it run this checkstyle? > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841667#comment-17841667 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1582206247 ## exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java: ## @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { +if (Objects.isNull(parent)) Review Comment: @mbeckerle Confirmed. I successfully built your branch by adding the aforementioned braces. I'll save you some additional trouble. There's another check style violation in `DaffodilBatchReader`. Drill doesn't like star imports for some reason. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841663#comment-17841663 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2909: URL: https://github.com/apache/drill/pull/2909#discussion_r1582202511 ## exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java: ## @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { +if (Objects.isNull(parent)) Review Comment: @mbeckerle I don't know why the checkstyle is telling you the wrong file, but here, you'll need braces as well as at line 203. ie: ```java if (parent instanceof MapBuilder) { resumeMap(); } ``` > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841637#comment-17841637 ] ASF GitHub Bot commented on DRILL-8474: --- shfshihuafeng commented on PR #2909: URL: https://github.com/apache/drill/pull/2909#issuecomment-2081475418 > This fails its tests due to a maven checkstyle failure. It's complaining about Drill:Exec:Vectors, which my code has no changes to. > > Can someone advise on what is wrong here? if (Objects.isNull(parent)) { throw new IllegalStateException("Call to resume() on MapBuilder with no parent."); } > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841636#comment-17841636 ] ASF GitHub Bot commented on DRILL-8474: --- shfshihuafeng commented on PR #2909: URL: https://github.com/apache/drill/pull/2909#issuecomment-2081475241 > This fails its tests due to a maven checkstyle failure. It's complaining about Drill:Exec:Vectors, which my code has no changes to. > > Can someone advise on what is wrong here? /home/runner/work/drill/drill/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:201:5 you need to add braces — the 'if' construct must use '{}' — like the following: if (Objects.isNull(parent)) { throw new IllegalStateException("Call to resume() on MapBuilder with no parent."); } > This fails its tests due to a maven checkstyle failure. It's complaining about Drill:Exec:Vectors, which my code has no changes to. > > Can someone advise on what is wrong here? exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java 201 I think you need to add {} for the if ``` if (Objects.isNull(parent)) { throw new IllegalStateException("Call to resume() on MapBuilder with no parent."); } ``` > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841537#comment-17841537 ] Mike Beckerle commented on DRILL-8474: -- PR for this ticket is now https://github.com/apache/drill/pull/2909 > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841530#comment-17841530 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle closed pull request #2836: DRILL-8474: Add Daffodil Format Plugin URL: https://github.com/apache/drill/pull/2836 > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841531#comment-17841531 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-2081176156 Creating a new squashed PR so as to avoid loss of the comments on this PR. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841528#comment-17841528 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-2081164073 This now passes all the daffodil contrib tests using the published official Daffodil 3.7.0. It does not yet run in any scalable fashion, but the metadata/data interfacing is complete. I would like to squash this to a single commit before merging, and it needs to be tested rebased onto the latest Drill commit. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810092#comment-17810092 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1906827568 Ok, so the geo-ip UDF stuff has no special mechanisms or description about those resource files, so the generic code that "scans" must find them and drag them along automatically. That's the behavior I want. What is "Drill's 3rd Party Jar folder"? If a magic folder just gets dragged over to all nodes, and drill uses a class loader that arranges for jars in that folder to be searched, then there is very little to do, since a DFDL schema can be just a set of jar files containing related resources, and the classes for Daffodil's own UDFs and layers which are java code extensions of its own kind. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810070#comment-17810070 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1906689793 > > > @cgivre @paul-rogers is there an example of a Drill UDF that is not part of the drill repository tree? > > > I'd like to understand the mechanisms for distributing any jar files and dependencies of the UDF that drill uses. I can't find any such in the quasi-USFs that are in the Drill tree, because well, since they are part of Drill, and so are their dependencies, this problem doesn't exist. > > > > > > @mbeckerle Here's an example: https://github.com/datadistillr/drill-humanname-functions. I'm sorry we weren't able to connect last week. > > If I understand this correctly, if a jar is on the classpath and has drill-module.conf in its root dir, then drill will find it and read that HOCON file to get the package to add to drill.classpath.scanning.packages. I believe that is correct. > > Drill then appears to scan jars for class files for those packages. Not sure what it is doing with the class files. I imagine it is repackaging them somehow so Drill can use them on the drill distributed nodes. But it isn't yet clear to me how this aspect works. Do these classes just get loaded on the distributed drill nodes? Or is the classpath augmented in some way on the drill nodes so that they see a jar that contains all these classes? > > I have two questions: > > (1) what about dependencies? The UDF may depend on libraries which depend on other libraries, etc. So UDFs are a bit of a special case, but if they do have dependencies, you have to also include those JAR files in the UDF directory, or in Drill's 3rd party JAR folder. I'm not that good with maven, but I've often wondered about making a so-called fat-JAR which includes the dependencies as part of the UDF JAR file. 
> > (2) what about non-class files, e.g., things under src/main/resources of the project that go into the jar, but aren't "class" files? How do those things also get moved? How would code running in the drill node access these? The usual method is to call getResource(URL) with a URL that gives the path within a jar file to the resource in question. Take a look at this UDF. https://github.com/datadistillr/drill-geoip-functions This UDF has a few external resources including a CSV file and the MaxMind databases. > > Thanks for any info. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810051#comment-17810051 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1906561549 > > @cgivre @paul-rogers is there an example of a Drill UDF that is not part of the drill repository tree? > > I'd like to understand the mechanisms for distributing any jar files and dependencies of the UDF that drill uses. I can't find any such in the quasi-USFs that are in the Drill tree, because well, since they are part of Drill, and so are their dependencies, this problem doesn't exist. > > @mbeckerle Here's an example: https://github.com/datadistillr/drill-humanname-functions. I'm sorry we weren't able to connect last week. If I understand this correctly, if a jar is on the classpath and has drill-module.conf in its root dir, then drill will find it and read that HOCON file to get the package to add to drill.classpath.scanning.packages. Drill then appears to scan jars for class files for those packages. Not sure what it is doing with the class files. I imagine it is repackaging them somehow so Drill can use them on the drill distributed nodes. But it isn't yet clear to me how this aspect works. Do these classes just get loaded on the distributed drill nodes? Or is the classpath augmented in some way on the drill nodes so that they see a jar that contains all these classes? I have two questions: (1) what about dependencies? The UDF may depend on libraries which depend on other libraries, etc. (2) what about non-class files, e.g., things under src/main/resources of the project that go into the jar, but aren't "class" files? How do those things also get moved? How would code running in the drill node access these? The usual method is to call getResource(URL) with a URL that gives the path within a jar file to the resource in question. Thanks for any info. 
> Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809174#comment-17809174 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1902751729 > @cgivre @paul-rogers is there an example of a Drill UDF that is not part of the drill repository tree? > > I'd like to understand the mechanisms for distributing any jar files and dependencies of the UDF that drill uses. I can't find any such in the quasi-UDFs that are in the Drill tree, because well, since they are part of Drill, and so are their dependencies, this problem doesn't exist. @mbeckerle Here's an example: https://github.com/datadistillr/drill-humanname-functions. I'm sorry we weren't able to connect last week. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809173#comment-17809173 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1902750285 @cgivre @paul-rogers is there an example of a Drill UDF that is not part of the drill repository tree? I'd like to understand the mechanisms for distributing any jar files and dependencies of the UDF that drill uses. I can't find any such in the quasi-UDFs that are in the Drill tree, because well, since they are part of Drill, and so are their dependencies, this problem doesn't exist. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809172#comment-17809172 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1461099077 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java: ## @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.runtime1.api.ChoiceMetadata; +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.MetadataHandler; +import org.apache.daffodil.runtime1.api.SequenceMetadata; +import org.apache.daffodil.runtime1.api.SimpleElementMetadata; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.MapBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * This class transforms a DFDL/Daffodil schema into a Drill Schema. + */ +public class DrillDaffodilSchemaVisitor extends MetadataHandler { + private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class); + /** + * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class so we + * have a stack of MapBuilders, and when empty we use the SchemaBuilder Review Comment: This is fixed in the latest commit. Created MapBuilderLike interface shared by SchemaBuilder and MapBuilder. I only populated it with the methods I needed. The corresponding problem doesn't really occur in the rowWriter area as tupleWriter is the common underlying class used. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17807233#comment-17807233 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1453422371 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java: ## @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.japi.DataProcessor; +import org.apache.drill.common.AutoCloseables; +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory; +import org.apache.drill.exec.store.dfs.DrillFileSystem; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Objects; + +import static org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory.*; +import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; + +public class DaffodilBatchReader implements ManagedReader { + + private static final Logger logger = LoggerFactory.getLogger(DaffodilBatchReader.class); + private final RowSetLoader rowSetLoader; + private final CustomErrorContext errorContext; + private final DaffodilMessageParser dafParser; + private final InputStream dataInputStream; + + public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, + FileSchemaNegotiator negotiator) { + +errorContext = negotiator.parentErrorContext(); +DaffodilFormatConfig dafConfig = readerConfig.plugin.getConfig(); + +String schemaURIString = dafConfig.getSchemaURI(); // "schema/complexArray1.dfdl.xsd"; +String rootName = dafConfig.getRootName(); +String rootNamespace 
= dafConfig.getRootNamespace(); +boolean validationMode = dafConfig.getValidationMode(); + +URI dfdlSchemaURI; +try { + dfdlSchemaURI = new URI(schemaURIString); +} catch (URISyntaxException e) { + throw UserException.validationError(e).build(logger); +} + +FileDescrip file = negotiator.file(); +DrillFileSystem fs = file.fileSystem(); +URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI); + +DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); +DataProcessor dp; +try { + dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, rootNamespace); +} catch (CompileFailure e) { + throw UserException.dataReadError(e) + .message(String.format("Failed to get Daffodil DFDL processor for: %s", fsSchemaURI)) + .addContext(errorContext).addContext(e.getMessage()).build(logger); +} +// Create the corresponding Drill schema. +// Note: this could be a very large schema. Think of a large complex RDBMS schema, +// all of it, hundreds of tables, but all part of the same metadata tree. +TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp); +// Inform Drill about the schema +negotiator.tableSchema(drillSchema, true); + +// +// DATA TIME: Next we construct the runtime objects, and open files. +// +// We get the DaffodilMessageParser, which is a stateful driver for daffodil that +// actually does the parsing. +rowSetLoader = negotiator.build().writer(); + +// We construct the Daffodil InfosetOutputter which the daffodil parser uses to +// convert infoset event
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806487#comment-17806487 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1890990577 > > @mbeckerle With respect to style, I tried to reply to that comment, but the thread won't let me. In any event, Drill classes will typically start with the constructor, then have whatever methods are appropriate for the class. The logger creation usually happens before the constructor. I think all of your other classes followed this format, so the one or two that didn't kind of jumped out at me. > > @cgivre I believe the style issues are all fixed. The build did not get any codestyle issues. The issue I was referring to was more around the organization of a few classes. Usually we'll have the constructor (if present) at the top followed by any class methods. I think there was a class or two where the constructor was at the bottom or something like that. In any event, consider the issue resolved. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806486#comment-17806486 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1451758017 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java: ## @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.japi.DataProcessor; +import org.apache.drill.common.AutoCloseables; +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory; +import org.apache.drill.exec.store.dfs.DrillFileSystem; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Objects; + +import static org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory.*; +import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; + +public class DaffodilBatchReader implements ManagedReader { + + private static final Logger logger = LoggerFactory.getLogger(DaffodilBatchReader.class); + private final RowSetLoader rowSetLoader; + private final CustomErrorContext errorContext; + private final DaffodilMessageParser dafParser; + private final InputStream dataInputStream; + + public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, + FileSchemaNegotiator negotiator) { + +errorContext = negotiator.parentErrorContext(); +DaffodilFormatConfig dafConfig = readerConfig.plugin.getConfig(); + +String schemaURIString = dafConfig.getSchemaURI(); // "schema/complexArray1.dfdl.xsd"; +String rootName = dafConfig.getRootName(); +String rootNamespace 
= dafConfig.getRootNamespace(); +boolean validationMode = dafConfig.getValidationMode(); + +URI dfdlSchemaURI; +try { + dfdlSchemaURI = new URI(schemaURIString); +} catch (URISyntaxException e) { + throw UserException.validationError(e).build(logger); +} + +FileDescrip file = negotiator.file(); +DrillFileSystem fs = file.fileSystem(); +URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI); + +DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); +DataProcessor dp; +try { + dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, rootNamespace); +} catch (CompileFailure e) { + throw UserException.dataReadError(e) + .message(String.format("Failed to get Daffodil DFDL processor for: %s", fsSchemaURI)) + .addContext(errorContext).addContext(e.getMessage()).build(logger); +} +// Create the corresponding Drill schema. +// Note: this could be a very large schema. Think of a large complex RDBMS schema, +// all of it, hundreds of tables, but all part of the same metadata tree. +TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp); +// Inform Drill about the schema +negotiator.tableSchema(drillSchema, true); + +// +// DATA TIME: Next we construct the runtime objects, and open files. +// +// We get the DaffodilMessageParser, which is a stateful driver for daffodil that +// actually does the parsing. +rowSetLoader = negotiator.build().writer(); + +// We construct the Daffodil InfosetOutputter which the daffodil parser uses to +// convert infoset event
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806484#comment-17806484 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1451757410 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java: ## @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.runtime1.api.ChoiceMetadata; +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.MetadataHandler; +import org.apache.daffodil.runtime1.api.SequenceMetadata; +import org.apache.daffodil.runtime1.api.SimpleElementMetadata; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.MapBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * This class transforms a DFDL/Daffodil schema into a Drill Schema. + */ +public class DrillDaffodilSchemaVisitor extends MetadataHandler { + private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class); + /** + * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class so we + * have a stack of MapBuilders, and when empty we use the SchemaBuilder Review Comment: This is likely music to @paul-rogers's ears. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806482#comment-17806482 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1451756763 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java: ## @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.japi.Compiler; +import org.apache.daffodil.japi.Daffodil; +import org.apache.daffodil.japi.DataProcessor; +import org.apache.daffodil.japi.Diagnostic; +import org.apache.daffodil.japi.InvalidParserException; +import org.apache.daffodil.japi.InvalidUsageException; +import org.apache.daffodil.japi.ProcessorFactory; +import org.apache.daffodil.japi.ValidationMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.channels.Channels; +import java.util.List; +import java.util.Objects; + +/** + * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL schema so that one can + * obtain a DataProcessor for use with DaffodilMessageParser. + * + * TODO: Needs to use a cache to avoid reloading/recompiling every time. + */ +public class DaffodilDataProcessorFactory { + // Default constructor is used. + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDataProcessorFactory.class); + + private DataProcessor dp; + + /** + * Gets a Daffodil DataProcessor given the necessary arguments to compile or reload it. + * + * @param schemaFileURI + * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd extension) + * @param validationMode + * Use true to request Daffodil built-in 'limited' validation. Use false for no validation. + * @param rootName + * Local name of root element of the message. Can be null to use the first element declaration + * of the primary schema file. Ignored if reloading a pre-compiled schema. + * @param rootNS + * Namespace URI as a string. Can be null to use the target namespace of the primary schema + * file or if it is unambiguous what element is the rootName. Ignored if reloading a + * pre-compiled schema. 
+ * @return the DataProcessor + * @throws CompileFailure + * - if schema compilation fails + */ + public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, String rootName, + String rootNS) + throws CompileFailure { + +DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory(); +boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin"); +if (isPrecompiled) { + if (Objects.nonNull(rootName) && !rootName.isEmpty()) { +// A usage error. You shouldn't supply the name and optionally namespace if loading +// precompiled schema because those are built into it. Should be null or "". +logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", +rootName); + } + try { +dmp.loadSchema(schemaFileURI); + } catch (IOException | InvalidParserException e) { +throw new CompileFailure(e); + } + dmp.setupDP(validationMode, null); +} else { + List pfDiags; + try { +pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS); + } catch (URISyntaxException | IOException e) { +throw new CompileFailure(e); + } + dmp.setupDP(validationMode, pfDiags); +} +return dmp.dp; + } + + private void loadSchema(URI schemaFileURI) throws IOException, InvalidParserException { +Compiler c = Daffodil.compiler(); +dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream())); Review Comment: This definitely seems like an area where there is potential for a lot of different things to go wrong. My view is we should just do our best to provide clear error
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806481#comment-17806481 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1451756527 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java: ## @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.japi.Compiler; +import org.apache.daffodil.japi.Daffodil; +import org.apache.daffodil.japi.DataProcessor; +import org.apache.daffodil.japi.Diagnostic; +import org.apache.daffodil.japi.InvalidParserException; +import org.apache.daffodil.japi.InvalidUsageException; +import org.apache.daffodil.japi.ProcessorFactory; +import org.apache.daffodil.japi.ValidationMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.channels.Channels; +import java.util.List; +import java.util.Objects; + +/** + * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL schema so that one can + * obtain a DataProcessor for use with DaffodilMessageParser. + * + * TODO: Needs to use a cache to avoid reloading/recompiling every time. + */ +public class DaffodilDataProcessorFactory { + // Default constructor is used. + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDataProcessorFactory.class); + + private DataProcessor dp; + + /** + * Gets a Daffodil DataProcessor given the necessary arguments to compile or reload it. + * + * @param schemaFileURI + * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd extension) + * @param validationMode + * Use true to request Daffodil built-in 'limited' validation. Use false for no validation. + * @param rootName + * Local name of root element of the message. Can be null to use the first element declaration + * of the primary schema file. Ignored if reloading a pre-compiled schema. + * @param rootNS + * Namespace URI as a string. Can be null to use the target namespace of the primary schema + * file or if it is unambiguous what element is the rootName. Ignored if reloading a + * pre-compiled schema. 
+ * @return the DataProcessor + * @throws CompileFailure + * - if schema compilation fails + */ + public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, String rootName, + String rootNS) + throws CompileFailure { + +DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory(); +boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin"); +if (isPrecompiled) { + if (Objects.nonNull(rootName) && !rootName.isEmpty()) { +// A usage error. You shouldn't supply the name and optionally namespace if loading +// precompiled schema because those are built into it. Should be null or "". +logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", +rootName); + } + try { +dmp.loadSchema(schemaFileURI); + } catch (IOException | InvalidParserException e) { +throw new CompileFailure(e); Review Comment: My thought here would be to fail as quickly as possible. If the DFDL schema can't be read, I'm assuming that we cannot proceed, so throwing an exception would be the right thing to do IMHO. With that said, we should make sure we provide a good error message that would explain what went wrong. One of the issues we worked on for a while with Drill was that it would fail and you'd get a stack trace w/o a clear idea of what the actual issue is and how to rectify it. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17804917#comment-17804917 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1883962208 > @mbeckerle With respect to style, I tried to reply to that comment, but the thread won't let me. In any event, Drill classes will typically start with the constructor, then have whatever methods are appropriate for the class. The logger creation usually happens before the constructor. I think all of your other classes followed this format, so the one or two that didn't kind of jumped out at me. @cgivre I believe the style issues are all fixed. The build did not get any codestyle issues. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803984#comment-17803984 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1880110452 > > @mbeckerle I had a thought about your TODO list. See inline. > > > This is ready for a next review. All the scalar types are now implemented with typed setter calls. > > > The prior review comments have all been addressed I believe. > > > Remaining things to do include: > > > > > > 1. How to get the compiled DFDL schema object so it can be loaded by daffodil out at the distributed Drill nodes. > > > > > > I was thinking about this and I remembered something that might be useful. Drill has support for User Defined Functions (UDF) which are written in Java. To add a UDF to Drill, you also have to write some Java classes in a particular way, and include the JARs. Much like the DFDL class files, the UDF JARs must be accessible to all nodes of a Drill cluster. > > Additionally, Drill has the capability of adding UDFs dynamically. This feature was added here: #574. Anyway, I wonder if we could use a similar mechanism to load and store the DFDL files so that they are accessible to all Drill nodes. What do you think? > > Excellent: So drill has all the machinery, it's just a question of repackaging it so it's available for this usage pattern, which is a bit different from Drill's UDFs, but also very similar. > > There are two user scenarios which we can call production and test. > > 1. Production: binary compiled DFDL schema file + code jars for Daffodil's own UDFs and "layers" plugins. This should, ideally, cache the compiled schema and not reload it for every query (at every node), but keep the same loaded instance in memory in a persistant JVM image on each node. 
For large production DFDL schemas this is the only sensible mechanism as it can take minutes to compile large DFDL schemas. > 2. Test: on-the-fly centralized compilation of DFDL schema (from a combination of jars and files) to create and cache (to avoid recompiling) the binary compiled DFDL schema file. Then using that compiled binary file, as item 1. For small DFDL schemas this can be fast enough for production use. Ideally, if the DFDL schema is unchanged this would reuse the compiled binary file, but that's an optimization that may not matter much. > > Kinds of objects involved are: > > * Daffodil plugin code jars > * DFDL schema jars > * DFDL schema files (just not packaged into a jar) > * Daffodil compiled schema binary file > * Daffodil config file - parameters, tunables, and options needed at compile time and/or runtime > > Code jars: Daffodil provides two extension features for DFDL users - DFDL UDFs and DFDL 'layers' (ex: plug-ins for uudecode, or gunzip algorithms used in part of the data format). Those are ordinary compiled class files in jars, so in all scenarios those jars are needed on the node class path if the DFDL schema uses them. Daffodil dynamically finds and loads these from the classpath in regular Java Service-Provider Interface (SPI) mechanisms. > > Schema jars: Daffodil packages DFDL schema files (source files i.e., mySchema.dfdl.xsd) into jar files to allow inter-schema dependencies to be managed using ordinary jar/java-style managed dependencies. Tools like sbt and maven can express the dependencies of one schema on another, grab and pull them together, etc. Daffodil has a resolver so when one schema file references another with include/import it searches the class path directories and jars for the files. > > Schema jars are only needed centrally when compiling the schema to a binary file. All references to the jar files for inter-schema file references are compiled into the compiled binary file. 
> > It is possible for one DFDL schema 'project' to define a DFDL schema, along with the code for a plugin like a Daffodil UDF or layer. In that case the one jar created is both a code jar and a schema jar. The schema jar aspects are used when the schema is compiled and ignored at Daffodil runtime. The code jar aspects are used at Daffodil run time and ignored at schema compilation time. So such a jar that is both code and schema jar needs to be on the class path in both places, but there's no interaction of the two things. > > Binary Compiled Schema File: Centrally, DFDL schemas in files and/or jars are compiled to create a single binary object which can be reloaded in order to actually use the schema to parse/unparse data. > > * These binary files are tied to a specific version+build of Daffodil. (They are just a java object serialization of the runtime data structures used by Daffodil). > * Once reloaded into a JVM to
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803983#comment-17803983 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1880109717 @mbeckerle With respect to style, I tried to reply to that comment, but the thread won't let me. In any event, Drill classes will typically start with the constructor, then have whatever methods are appropriate for the class. The logger creation usually happens before the constructor. I think all of your other classes followed this format, so the one or two that didn't kind of jumped out at me. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803594#comment-17803594 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1442993784 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java: ## @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.runtime1.api.ChoiceMetadata; +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.MetadataHandler; +import org.apache.daffodil.runtime1.api.SequenceMetadata; +import org.apache.daffodil.runtime1.api.SimpleElementMetadata; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.MapBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * This class transforms a DFDL/Daffodil schema into a Drill Schema. + */ +public class DrillDaffodilSchemaVisitor extends MetadataHandler { + private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class); + /** + * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class so we + * have a stack of MapBuilders, and when empty we use the SchemaBuilder Review Comment: Note that this awkwardness effectively doubles the code size of things that interface to Drill. This duplication of similar behavior for schema and map builders (and rowWriters and mapWriters) is expected and typical of systems that start from a tabular view of the data world and later add the features needed for hierarchical data. Nevertheless it is awkward when one is dealing entirely with hierarchical data. A MetaBuilder that does the map thing if the builder is a map, and the schema thing if the builder is a schema would eliminate this. This could be an interface mixed into both SchemaBuilder and MapBuilder (could also be called MapBuilderLike). The same discontinuity at the base holds for RowWriter vs.
MapWriter in the runtime handling of data. Again it doubles the code size/complexity, every fix goes in 2 places, etc. A MapWriterLike interface could be factored out. Maybe we should build such mechanisms to avoid this, and then use them to improve this Daffodil plugin? ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java: ## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.daffodil.japi.InvalidParserException; +import org.apache.daffodil.japi.DataProcessor; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import com.google.common.annotations.VisibleForTesting; +import
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803592#comment-17803592 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1878896878 > @mbeckerle I had a thought about your TODO list. See inline. > > > This is ready for a next review. All the scalar types are now implemented with typed setter calls. > > The prior review comments have all been addressed I believe. > > Remaining things to do include: > > > > 1. How to get the compiled DFDL schema object so it can be loaded by daffodil out at the distributed Drill nodes. > > I was thinking about this and I remembered something that might be useful. Drill has support for User Defined Functions (UDF) which are written in Java. To add a UDF to Drill, you also have to write some Java classes in a particular way, and include the JARs. Much like the DFDL class files, the UDF JARs must be accessible to all nodes of a Drill cluster. > > Additionally, Drill has the capability of adding UDFs dynamically. This feature was added here: #574. Anyway, I wonder if we could use a similar mechanism to load and store the DFDL files so that they are accessible to all Drill nodes. What do you think? Excellent: So drill has all the machinery, it's just a question of repackaging it so it's available for this usage pattern, which is a bit different from Drill's UDFs, but also very similar. There are two user scenarios which we can call production and test. 1. Production: binary compiled DFDL schema file + code jars for Daffodil's own UDFs and "layers" plugins. This should, ideally, cache the compiled schema and not reload it for every query (at every node), but keep the same loaded instance in memory in a persistent JVM image on each node. For large production DFDL schemas this is the only sensible mechanism as it can take minutes to compile large DFDL schemas. 2.
Test: on-the-fly centralized compilation of DFDL schema (from a combination of jars and files) to create and cache (to avoid recompiling) the binary compiled DFDL schema file. Then using that compiled binary file, as item 1. For small DFDL schemas this can be fast enough for production use. Ideally, if the DFDL schema is unchanged this would reuse the compiled binary file, but that's an optimization that may not matter much. Kinds of objects involved are: - Daffodil plugin code jars - DFDL schema jars - DFDL schema files (just not packaged into a jar) - Daffodil compiled schema binary file - Daffodil config file - parameters, tunables, and options needed at compile time and/or runtime Code jars: Daffodil provides two extension features for DFDL users - DFDL UDFs and DFDL 'layers' (ex: plug-ins for uudecode, or gunzip algorithms used in part of the data format). Those are ordinary compiled class files in jars, so in all scenarios those jars are needed on the node class path if the DFDL schema uses them. Daffodil dynamically finds and loads these from the classpath in regular Java Service-Provider Interface (SPI) mechanisms. Schema jars: Daffodil packages DFDL schema files (source files i.e., mySchema.dfdl.xsd) into jar files to allow inter-schema dependencies to be managed using ordinary jar/java-style managed dependencies. Tools like sbt and maven can express the dependencies of one schema on another, grab and pull them together, etc. Daffodil has a resolver so when one schema file references another with include/import it searches the class path directories and jars for the files. Schema jars are only needed centrally when compiling the schema to a binary file. All references to the jar files for inter-schema file references are compiled into the compiled binary file. It is possible for one DFDL schema 'project' to define a DFDL schema, along with the code for a plugin like a Daffodil UDF or layer.
In that case the one jar created is both a code jar and a schema jar. The schema jar aspects are used when the schema is compiled and ignored at Daffodil runtime. The code jar aspects are used at Daffodil run time and ignored at schema compilation time. So such a jar that is both code and schema jar needs to be on the class path in both places, but there's no interaction of the two things. Binary Compiled Schema File: Centrally, DFDL schemas in files and/or jars are compiled to create a single binary object which can be reloaded in order to actually use the schema to parse/unparse data. - These binary files are tied to a specific version+build of Daffodil. (They are just a java object serialization of the runtime data structures used by Daffodil). - Once reloaded into a JVM to create a Daffodil DataProcessor object, that object is
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803367#comment-17803367 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1877962587 @mbeckerle I had a thought about your TODO list. See inline. > This is ready for a next review. All the scalar types are now implemented with typed setter calls. > > The prior review comments have all been addressed I believe. > > Remaining things to do include: > > 1. How to get the compiled DFDL schema object so it can be loaded by daffodil out at the distributed Drill nodes. I was thinking about this and I remembered something that might be useful. Drill has support for User Defined Functions (UDF) which are written in Java. To add a UDF to Drill, you also have to write some Java classes in a particular way, and include the JARs. Much like the DFDL class files, the UDF JARs must be accessible to all nodes of a Drill cluster. Additionally, Drill has the capability of adding UDFs dynamically. This feature was added here: https://github.com/apache/drill/pull/574. Anyway, I wonder if we could use a similar mechanism to load and store the DFDL files so that they are accessible to all Drill nodes. What do you think? > 2. Test of nilled values (and more tests generally to show deeply nested and repeating nested objects work.) > 3. Errors - revisit every place errors are detected or thrown to make sure these are being done the right way for DFDL schema compilation and runtime errors as well. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803353#comment-17803353 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1877915796 This is ready for a next review. All the scalar types are now implemented with typed setter calls. The prior review comments have all been addressed I believe. Remaining things to do include: 1. How to get the compiled DFDL schema object so it can be loaded by daffodil out at the distributed Drill nodes. 2. Test of nilled values (and more tests generally to show deeply nested and repeating nested objects work.) 3. Errors - revisit every place errors are detected or thrown to make sure these are being done the right way for DFDL schema compilation and runtime errors as well. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803351#comment-17803351 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1442338979 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java: ## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetArray; +import org.apache.daffodil.runtime1.api.InfosetComplexElement; +import org.apache.daffodil.japi.infoset.InfosetOutputter; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; +import org.apache.drill.exec.vector.accessor.ArrayWriter; +import org.apache.drill.exec.vector.accessor.ColumnWriter; +import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.TupleWriter; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * Adapts Daffodil parser infoset event calls to Drill writer calls + * to fill in Drill data rows. + */ +public class DaffodilDrillInfosetOutputter +extends InfosetOutputter { + + private boolean isOriginalRoot() { +boolean result = currentTupleWriter() == rowSetWriter; Review Comment: Next commit will have files reformatted based on the eclipse settings in the dev-support/formatter directory, as implemented by intelliJ IDEA when those settings were imported. > Add Daffodil Format Plugin > -- > > Key: DRILL-8474 > URL: https://issues.apache.org/jira/browse/DRILL-8474 > Project: Apache Drill > Issue Type: New Feature >Affects Versions: 1.21.1 >Reporter: Charles Givre >Priority: Major > Fix For: 1.22.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803337#comment-17803337 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on PR #2836: URL: https://github.com/apache/drill/pull/2836#issuecomment-1877814024 Let me respond between the paragraphs On Tue, Jan 2, 2024 at 11:49 PM Paul Rogers ***@***.***> wrote: > Hi Mike, > > Just jumping in with a random thought. Drill has accumulated a number of > schema systems: Parquet metadata cache, HMS, Drill's own metastore, > "provided schema", and now DFDL. All provide ways of defining data: be it > Parquet, JSON, CSV or whatever. One can't help but wonder, should some > future version try to reduce this variation somewhat? Maybe map all the > variations to DFDL? Map DFDL to Drill's own mechanisms? > > Well we can dream can't we :-) I can contribute the ideas in https://daffodil.apache.org/dev/design-notes/Proposed-DFDL-Standard-Profile.md which is an effort to restrict the DFDL language so that schemas written in DFDL can work more smoothly with Drill, NiFi, Spark, Flink, Beam, etc. etc. DFDL's data model is too restrictive to be "the model" for Drill since Drill wants to query even unstructured data like XML without schema. DFDL's data model is targeted only at structured data. Drill's data model and APIs seem optimized for streaming block-buffered top-level rows of data (the EVF API does anyway). Top level row-sets are first-class citizens, as are the fields of said rows. Fields containing arrays of maps (possibly containing more arrays of maps, and so on deeply nested) are not handled uniformly with the same block-buffered "row-like" mechanisms. The APIs are similar, but not polymorphic. I suspect that the block-buffered data streaming in Drill only happens for top-level rows, because there is no test for whether or not you are allowed to create another array item like there is a test for creating another row in a row-set writer. 
There is no control inversion where an adapter must give back control to Drill in the middle of trying to write an array. The current Drill/Daffodil interface I've created doesn't cope with header-body* files (ex: PCAP which format has a header record, then repeating packet records) as it has no way of returning just the body records as top level rows. So while there exists a DFDL schema for PCAP, you really do want to use a dedicated PCAP Drill adapter which hands back rows, not Daffodil which will parse the entire PCAP file into one huge row containing a monster sub-array of packets, where each packet is a map within the array of maps. This is ok for now as many files where DFDL is used are not like PCAP. They are just repeating records of one format with no special whole-file header. Eventually we will want to be able to supply a path to tell the Drill/Daffodil interface that you only want the packet array as the output rows. (This is the unimplemented Daffodil "onPath(...)" API feature. We haven't needed this yet for DFDL work in cybersecurity, but it was anticipated 10+ years back as essential for data integration.) > Drill uses two kinds of metadata: schema definitions and file metadata > used > for scan pruning. Schema information could be used at plan time (to > provide > column types), but certainly at scan time (to "discover" the defined > schema.) File metadata is used primarily at plan time to work out how to > distribute work. DFDL has zero notion of file metadata. It doesn't know whether data even comes from a file or an open TCP socket. Daffodil/DFDL just sees a java.io.InputStream. The schema it uses for a given file is specified by the API call. Daffodil does nothing itself to try to find or identify any schema. So we're "blank slate" on this issue with DFDL. > > > A bit of background on scan pruning. 
Back in the day, it was common to > have > thousands or millions of files in Hadoop to scan: this was why tools like > Drill were distributed: divide and conquer. And, of course, the fastest > scan is to skip files that we know can't contain the information we want. > File metadata captures this information outside of the files themselves. > HMS was the standard solution in the Hadoop days. (Amazon Glue, for S3, is > evidently based on HMS.) > > For example, Drill's Parquet metadata cache, the Drill metastore and HMS > all provide both schema and file metadata information. The schema > information mainly helped with schema evolution: over time, different > files > have different sets of columns. File metadata provides information *about* > the file, such as the data ranges stored in each file. For Parquet,
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803334#comment-17803334 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1442278098 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java: ## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetArray; +import org.apache.daffodil.runtime1.api.InfosetComplexElement; +import org.apache.daffodil.japi.infoset.InfosetOutputter; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; +import org.apache.drill.exec.vector.accessor.ArrayWriter; +import org.apache.drill.exec.vector.accessor.ColumnWriter; +import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.TupleWriter; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * Adapts Daffodil parser infoset event calls to Drill writer calls + * to fill in Drill data rows. + */ +public class DaffodilDrillInfosetOutputter +extends InfosetOutputter { + + private boolean isOriginalRoot() { +boolean result = currentTupleWriter() == rowSetWriter; +if (result) + assert(tupleWriterStack.size() == 1); +return result; + } + + /** + * True if the next startComplex call will be for the + * DFDL infoset root element whose children are the columns of + * the row set. + */ + private boolean isRootElement = true; + + /** + * Stack that is used only if we have sub-structures that are not + * simple-type fields of the row. 
+ */ + private final Stack tupleWriterStack = new Stack<>(); + + private final Stack arrayWriterStack = new Stack<>(); + + private TupleWriter currentTupleWriter() { +return tupleWriterStack.peek(); + } + + private ArrayWriter currentArrayWriter() { +return arrayWriterStack.peek(); + } + + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); + + private DaffodilDrillInfosetOutputter() {} // no default constructor + + private RowSetLoader rowSetWriter; + + public DaffodilDrillInfosetOutputter(RowSetLoader writer) { +this.rowSetWriter = writer; +this.tupleWriterStack.push(writer); + } + + @Override + public void reset() { +tupleWriterStack.clear(); +tupleWriterStack.push(rowSetWriter); +arrayWriterStack.clear(); +this.isRootElement = true; +checkCleanState(); + } + + private void checkCleanState() { +assert(isOriginalRoot()); +assert(arrayWriterStack.isEmpty()); +assert(isRootElement); + } + + @Override + public void startDocument() { +checkCleanState(); + } + + @Override + public void endDocument() { +checkCleanState(); + } + + private String colName(ElementMetadata md) { +return DrillDaffodilSchemaVisitor.makeColumnName(md); + } + + @Override + public void startSimple(InfosetSimpleElement ise) { +assert (!isRootElement); +ElementMetadata md = ise.metadata(); +String colName = colName(md); +ColumnWriter cw; +if (md.isArray()) { + // A simple type array + assert(!arrayWriterStack.isEmpty()); + cw = currentArrayWriter().scalar(); +} else { + // A simple element within a map + // Note the map itself might be an array + // but we don't care about that here. + cw =
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803249#comment-17803249 ] ASF GitHub Bot commented on DRILL-8474: --- mbeckerle commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1442045159 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java: ## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetArray; +import org.apache.daffodil.runtime1.api.InfosetComplexElement; +import org.apache.daffodil.japi.infoset.InfosetOutputter; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; +import org.apache.drill.exec.vector.accessor.ArrayWriter; +import org.apache.drill.exec.vector.accessor.ColumnWriter; +import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.TupleWriter; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * Adapts Daffodil parser infoset event calls to Drill writer calls + * to fill in Drill data rows. + */ +public class DaffodilDrillInfosetOutputter +extends InfosetOutputter { + + private boolean isOriginalRoot() { +boolean result = currentTupleWriter() == rowSetWriter; +if (result) + assert(tupleWriterStack.size() == 1); +return result; + } + + /** + * True if the next startComplex call will be for the + * DFDL infoset root element whose children are the columns of + * the row set. + */ + private boolean isRootElement = true; + + /** + * Stack that is used only if we have sub-structures that are not + * simple-type fields of the row. 
+ */ + private final Stack tupleWriterStack = new Stack<>(); + + private final Stack arrayWriterStack = new Stack<>(); + + private TupleWriter currentTupleWriter() { +return tupleWriterStack.peek(); + } + + private ArrayWriter currentArrayWriter() { +return arrayWriterStack.peek(); + } + + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); + + private DaffodilDrillInfosetOutputter() {} // no default constructor + + private RowSetLoader rowSetWriter; + + public DaffodilDrillInfosetOutputter(RowSetLoader writer) { +this.rowSetWriter = writer; +this.tupleWriterStack.push(writer); + } + + @Override + public void reset() { +tupleWriterStack.clear(); +tupleWriterStack.push(rowSetWriter); +arrayWriterStack.clear(); +this.isRootElement = true; +checkCleanState(); + } + + private void checkCleanState() { +assert(isOriginalRoot()); +assert(arrayWriterStack.isEmpty()); +assert(isRootElement); + } + + @Override + public void startDocument() { +checkCleanState(); + } + + @Override + public void endDocument() { +checkCleanState(); + } + + private String colName(ElementMetadata md) { +return DrillDaffodilSchemaVisitor.makeColumnName(md); + } + + @Override + public void startSimple(InfosetSimpleElement ise) { +assert (!isRootElement); +ElementMetadata md = ise.metadata(); +String colName = colName(md); +ColumnWriter cw; +if (md.isArray()) { + // A simple type array + assert(!arrayWriterStack.isEmpty()); + cw = currentArrayWriter().scalar(); +} else { + // A simple element within a map + // Note the map itself might be an array + // but we don't care about that here. + cw =
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803180#comment-17803180 ] ASF GitHub Bot commented on DRILL-8474: --- cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1441799950 ## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java: ## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetArray; +import org.apache.daffodil.runtime1.api.InfosetComplexElement; +import org.apache.daffodil.japi.infoset.InfosetOutputter; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; +import org.apache.drill.exec.vector.accessor.ArrayWriter; +import org.apache.drill.exec.vector.accessor.ColumnWriter; +import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.TupleWriter; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * Adapts Daffodil parser infoset event calls to Drill writer calls + * to fill in Drill data rows. + */ +public class DaffodilDrillInfosetOutputter +extends InfosetOutputter { + + private boolean isOriginalRoot() { +boolean result = currentTupleWriter() == rowSetWriter; +if (result) + assert(tupleWriterStack.size() == 1); +return result; + } + + /** + * True if the next startComplex call will be for the + * DFDL infoset root element whose children are the columns of + * the row set. + */ + private boolean isRootElement = true; + + /** + * Stack that is used only if we have sub-structures that are not + * simple-type fields of the row. 
+ */ + private final Stack tupleWriterStack = new Stack<>(); + + private final Stack arrayWriterStack = new Stack<>(); + + private TupleWriter currentTupleWriter() { +return tupleWriterStack.peek(); + } + + private ArrayWriter currentArrayWriter() { +return arrayWriterStack.peek(); + } + + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); + + private DaffodilDrillInfosetOutputter() {} // no default constructor + + private RowSetLoader rowSetWriter; + + public DaffodilDrillInfosetOutputter(RowSetLoader writer) { +this.rowSetWriter = writer; +this.tupleWriterStack.push(writer); + } + + @Override + public void reset() { +tupleWriterStack.clear(); +tupleWriterStack.push(rowSetWriter); +arrayWriterStack.clear(); +this.isRootElement = true; +checkCleanState(); + } + + private void checkCleanState() { +assert(isOriginalRoot()); +assert(arrayWriterStack.isEmpty()); +assert(isRootElement); + } + + @Override + public void startDocument() { +checkCleanState(); + } + + @Override + public void endDocument() { +checkCleanState(); + } + + private String colName(ElementMetadata md) { +return DrillDaffodilSchemaVisitor.makeColumnName(md); + } + + @Override + public void startSimple(InfosetSimpleElement ise) { +assert (!isRootElement); +ElementMetadata md = ise.metadata(); +String colName = colName(md); +ColumnWriter cw; +if (md.isArray()) { + // A simple type array + assert(!arrayWriterStack.isEmpty()); + cw = currentArrayWriter().scalar(); +} else { + // A simple element within a map + // Note the map itself might be an array + // but we don't care about that here. + cw =
[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin
[ https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17802191#comment-17802191 ]
Charles Givre commented on DRILL-8474:
--
[https://github.com/apache/drill/pull/2836]
> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
> Issue Type: New Feature
> Affects Versions: 1.21.1
> Reporter: Charles Givre
> Priority: Major
> Fix For: 1.22.0
>
>
--
This message was sent by Atlassian Jira (v8.20.10#820010)