[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-07-05 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17863373#comment-17863373
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1666968774


##
contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java:
##
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.drill.categories.RowSetTest;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetReader;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+@Category(RowSetTest.class)
+public class TestDaffodilReader extends ClusterTest {
+
+  String schemaURIRoot = 
"file:///opt/drill/contrib/format-daffodil/src/test/resources/";

Review Comment:
   What, exactly, do I change this to, if I want to retrieve files from 
$DRILL_CONFIG_DIR/lib ? 



##
contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java:
##
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.drill.categories.RowSetTest;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetReader;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+@Category(RowSetTest.class)
+public class TestDaffodilReader extends ClusterTest {
+
+  String schemaURIRoot = 
"file:///opt/drill/contrib/format-daffodil/src/test/resources/";
+
+  @BeforeClass
+  public static void setup() throws Exception {
+// boilerplate call to start test rig
+ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+DaffodilFormatConfig formatConfig = new DaffodilFormatConfig(null, "", "", 
"", false);
+
+cluster.defineFormat("dfs", "daffodil", formatConfig);
+
+// Needed to test against compressed files.
+// Copies data from src/test/resources to the dfs root.
+dirTestWatcher.copyResourceToRoot(Paths.get("data/"));
+dirTestWatcher.copyResourceToRoot(Paths.get("schema/"));
+  }
+
+  private String selectRow(String schema, 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-05-09 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17845113#comment-17845113
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1595903385


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java:
##
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.japi.Compiler;
+import org.apache.daffodil.japi.Daffodil;
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.daffodil.japi.Diagnostic;
+import org.apache.daffodil.japi.InvalidParserException;
+import org.apache.daffodil.japi.InvalidUsageException;
+import org.apache.daffodil.japi.ProcessorFactory;
+import org.apache.daffodil.japi.ValidationMode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.channels.Channels;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL 
schema so that one can
+ * obtain a DataProcessor for use with DaffodilMessageParser.
+ * 
+ * TODO: Needs to use a cache to avoid reloading/recompiling every time.
+ */
+public class DaffodilDataProcessorFactory {
+  // Default constructor is used.
+
+  private static final Logger logger = 
LoggerFactory.getLogger(DaffodilDataProcessorFactory.class);
+
+  private DataProcessor dp;
+
+  /**
+   * Gets a Daffodil DataProcessor given the necessary arguments to compile or 
reload it.
+   *
+   * @param schemaFileURI
+   * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd 
extension)
+   * @param validationMode
+   * Use true to request Daffodil built-in 'limited' validation. Use false 
for no validation.
+   * @param rootName
+   * Local name of root element of the message. Can be null to use the 
first element declaration
+   * of the primary schema file. Ignored if reloading a pre-compiled 
schema.
+   * @param rootNS
+   * Namespace URI as a string. Can be null to use the target namespace of 
the primary schema
+   * file or if it is unambiguous what element is the rootName. Ignored if 
reloading a
+   * pre-compiled schema.
+   * @return the DataProcessor
+   * @throws CompileFailure
+   * - if schema compilation fails
+   */
+  public DataProcessor getDataProcessor(URI schemaFileURI, boolean 
validationMode, String rootName,
+  String rootNS)
+  throws CompileFailure {
+
+DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory();
+boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin");
+if (isPrecompiled) {
+  if (Objects.nonNull(rootName) && !rootName.isEmpty()) {
+// A usage error. You shouldn't supply the name and optionally 
namespace if loading
+// precompiled schema because those are built into it. Should be null 
or "".
+logger.warn("Root element name '{}' is ignored when used with 
precompiled DFDL schema.",
+rootName);
+  }
+  try {
+dmp.loadSchema(schemaFileURI);
+  } catch (IOException | InvalidParserException e) {
+throw new CompileFailure(e);
+  }
+  dmp.setupDP(validationMode, null);
+} else {
+  List pfDiags;
+  try {
+pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS);
+  } catch (URISyntaxException | IOException e) {
+throw new CompileFailure(e);
+  }
+  dmp.setupDP(validationMode, pfDiags);
+}
+return dmp.dp;
+  }
+
+  private void loadSchema(URI schemaFileURI) throws IOException, 
InvalidParserException {
+Compiler c = Daffodil.compiler();
+dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream()));
+  }
+
+  private List compileSchema(URI schemaFileURI, String rootName, 
String rootNS)
+  throws URISyntaxException, IOException, CompileFailure {
+Compiler c = 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-05-06 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17843832#comment-17843832
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2909:
URL: https://github.com/apache/drill/pull/2909#issuecomment-209976

   > Hi Mike, Are you free at all this week? My apologies... We're in the 
middle of putting an offer on a house and my life is very hectic at the moment. 
Best, 

> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-05-05 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17843601#comment-17843601
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2909:
URL: https://github.com/apache/drill/pull/2909#issuecomment-2095044801

   Hi Mike, 
   Are you free at all this week?  My apologies... We're in the middle of 
putting an offer on a house and my life is very hectic at the moment.
   Best,
   

> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841807#comment-17841807
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2909:
URL: https://github.com/apache/drill/pull/2909#issuecomment-2081781546

   Tests are now failing due to these two things in TestDaffodilReader.java
   ```
 String schemaURIRoot = 
"file:///opt/drill/contrib/format-daffodil/src/test/resources/";
   ```
   That's an absolute URI that is used to obtain access to the schema files in 
this statement:
   ```
 private String selectRow(String schema, String file) {
   return "SELECT * FROM table(dfs.`data/" + file + "` " + " (type => 
'daffodil'," + " " +
   "validationMode => 'true', " + " schemaURI => '" + schemaURIRoot + 
"schema/" + schema +
   ".dfdl.xsd'," + " rootName => 'row'," + " rootNamespace => null " + 
"))";
 }
   ```
   This is assembling a select statement, and puts this absolute schemaURI into 
the schemaURI part of the select. 
   
   What should I be doing to arrange for these schema URIs to be found? 
   
   The schemas are a large, complex set of files, not just a single file. Many 
files (potentially hundreds) must be found relative to the initial root schema 
file, because they include/import other schema files using relative paths. 
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841775#comment-17841775
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1582375084


##
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:
##
@@ -185,6 +192,26 @@ public MapBuilder resumeMap() {
 return (MapBuilder) parent;
   }
 
+  /**
+   * Depending on whether the parent is a schema builder or map builder
+   * we resume appropriately.
+   */
+  @Override
+  public void resume() {
+if (Objects.isNull(parent))

Review Comment:
   I just built Drill using the following command:
   
   ```sh
   mvn clean install -DskipTests
   ```
   When I did that, I was getting the same error as on GitHub.  After adding 
the braces as described above, it built without issues.
   With that said, I think you can run just the checkstyle step with:
   
   ```sh
   mvn checkstyle:checkstyle
   ```





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841774#comment-17841774
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1582375084


##
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:
##
@@ -185,6 +192,26 @@ public MapBuilder resumeMap() {
 return (MapBuilder) parent;
   }
 
+  /**
+   * Depending on whether the parent is a schema builder or map builder
+   * we resume appropriately.
+   */
+  @Override
+  public void resume() {
+if (Objects.isNull(parent))

Review Comment:
   I just built Drill using the following command:
   
   ```sh
   mvn clean install -DskipTests
   ```
   
   I think you can run just the checkstyle step with:
   
   ```sh
   mvn checkstyle:checkstyle
   ```





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841768#comment-17841768
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1582367382


##
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:
##
@@ -185,6 +192,26 @@ public MapBuilder resumeMap() {
 return (MapBuilder) parent;
   }
 
+  /**
+   * Depending on whether the parent is a schema builder or map builder
+   * we resume appropriately.
+   */
+  @Override
+  public void resume() {
+if (Objects.isNull(parent))

Review Comment:
   What is the maven command line to just make it run this checkstyle?





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841667#comment-17841667
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1582206247


##
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:
##
@@ -185,6 +192,26 @@ public MapBuilder resumeMap() {
 return (MapBuilder) parent;
   }
 
+  /**
+   * Depending on whether the parent is a schema builder or map builder
+   * we resume appropriately.
+   */
+  @Override
+  public void resume() {
+if (Objects.isNull(parent))

Review Comment:
   @mbeckerle Confirmed.  I successfully built your branch by adding the 
aforementioned braces.  I'll save you some additional trouble.  There's another 
check style violation in `DaffodilBatchReader`.  Drill doesn't like star 
imports for some reason.





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841663#comment-17841663
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2909:
URL: https://github.com/apache/drill/pull/2909#discussion_r1582202511


##
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:
##
@@ -185,6 +192,26 @@ public MapBuilder resumeMap() {
 return (MapBuilder) parent;
   }
 
+  /**
+   * Depending on whether the parent is a schema builder or map builder
+   * we resume appropriately.
+   */
+  @Override
+  public void resume() {
+if (Objects.isNull(parent))

Review Comment:
   @mbeckerle I don't know why the checkstyle is telling you the wrong file, 
but here, you'll need braces as well as at line 203. 
   
   ie:
   ```java
   if (parent instanceof MapBuilder) {
 resumeMap();
   }
   ```
   





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841637#comment-17841637
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

shfshihuafeng commented on PR #2909:
URL: https://github.com/apache/drill/pull/2909#issuecomment-2081475418

   > This fails its tests due to a maven checkstyle failure. It's complaining 
about Drill:Exec:Vectors, which my code has no changes to.
   > 
   > Can someone advise on what is wrong here?
   
if (Objects.isNull(parent)) {
   throw new IllegalStateException("Call to resume() on MapBuilder with no 
parent.");
   }




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-28 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841636#comment-17841636
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

shfshihuafeng commented on PR #2909:
URL: https://github.com/apache/drill/pull/2909#issuecomment-2081475241

   > This fails its tests due to a maven checkstyle failure. It's complaining 
about Drill:Exec:Vectors, which my code has no changes to.
   > 
   > Can someone advise on what is wrong here?
   
/home/runner/work/drill/drill/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java:201:5
   You need to add braces: an 'if' construct must use '{}', like the following:
   
if (Objects.isNull(parent)) {
   throw new IllegalStateException("Call to resume() on MapBuilder with no 
parent.");
   }
 
   
   > This fails its tests due to a maven checkstyle failure. It's complaining 
about Drill:Exec:Vectors, which my code has no changes to.
   > 
   > Can someone advise on what is wrong here?
   
   
exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java 
 201 

   I think you need to add {} to the if statement:
   ```
if (Objects.isNull(parent)) {
   throw new IllegalStateException("Call to resume() on MapBuilder with no 
parent.");
   }
   ```




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-27 Thread Mike Beckerle (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841537#comment-17841537
 ] 

Mike Beckerle commented on DRILL-8474:
--

PR for this ticket is now https://github.com/apache/drill/pull/2909

> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-27 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841530#comment-17841530
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle closed pull request #2836: DRILL-8474: Add Daffodil Format Plugin
URL: https://github.com/apache/drill/pull/2836




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-27 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841531#comment-17841531
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-2081176156

   Creating a new squashed PR so as to avoid loss of the comments on this PR. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-04-27 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17841528#comment-17841528
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-2081164073

   This now passes all the daffodil contrib tests using the published official 
Daffodil 3.7.0.
   
   It does not yet run in any scalable fashion, but the metadata/data 
interfacing is complete. 
   
   I would like to squash this to a single commit before merging, and it needs 
to be tested rebased onto the latest Drill commit. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-23 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810092#comment-17810092
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1906827568

   Ok, so the geo-ip UDF stuff has no special mechanisms or description about 
those resource files, so the generic code that "scans" must find them and drag 
them along automatically. 
   
   That's the behavior I want. 
   
   What is "Drill's 3rd Party Jar folder"? 
   
   If a magic folder just gets dragged over to all nodes, and drill uses a 
class loader that arranges for jars in that folder to be searched, then there 
is very little to do, since a DFDL schema can be just a set of jar files 
containing related resources, and the classes for Daffodil's own UDFs and 
layers which are java code extensions of its own kind. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-23 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810070#comment-17810070
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1906689793

   > > > @cgivre @paul-rogers is there an example of a Drill UDF that is not 
part of the drill repository tree?
   > > > I'd like to understand the mechanisms for distributing any jar files 
and dependencies of the UDF that drill uses. I can't find any such in the 
quasi-UDFs that are in the Drill tree, because well, since they are part of 
Drill, and so are their dependencies, this problem doesn't exist.
   > > 
   > > 
   > > @mbeckerle Here's an example: 
https://github.com/datadistillr/drill-humanname-functions. I'm sorry we weren't 
able to connect last week.
   > 
   > If I understand this correctly, if a jar is on the classpath and has 
drill-module.conf in its root dir, then drill will find it and read that HOCON 
file to get the package to add to drill.classpath.scanning.packages.
   
   I believe that is correct.
   
   > 
   > Drill then appears to scan jars for class files for those packages. Not 
sure what it is doing with the class files. I imagine it is repackaging them 
somehow so Drill can use them on the drill distributed nodes. But it isn't yet 
clear to me how this aspect works. Do these classes just get loaded on the 
distributed drill nodes? Or is the classpath augmented in some way on the drill 
nodes so that they see a jar that contains all these classes?
   > 
   > I have two questions:
   > 
   > (1) what about dependencies? The UDF may depend on libraries which depend 
on other libraries, etc.
   
   So UDFs are a bit of a special case, but if they do have dependencies, you 
have to also include those JAR files in the UDF directory, or in Drill's 3rd 
party JAR folder.   I'm not that good with maven, but I've often wondered about 
making a so-called fat-JAR which includes the dependencies as part of the UDF 
JAR file.
   
   > 
   > (2) what about non-class files, e.g., things under src/main/resources of 
the project that go into the jar, but aren't "class" files? How do those things 
also get moved? How would code running in the drill node access these? The 
usual method is to call getResource(URL) with a URL that gives the path within 
a jar file to the resource in question.
   
   Take a look at this UDF. 
https://github.com/datadistillr/drill-geoip-functions
   This UDF has a few external resources including a CSV file and the MaxMind 
databases.
   
   
   > 
   > Thanks for any info.
   
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-23 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17810051#comment-17810051
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1906561549

   > > @cgivre @paul-rogers is there an example of a Drill UDF that is not part 
of the drill repository tree?
   > > I'd like to understand the mechanisms for distributing any jar files and 
dependencies of the UDF that drill uses. I can't find any such in the 
quasi-UDFs that are in the Drill tree, because well, since they are part of 
Drill, and so are their dependencies, this problem doesn't exist.
   > 
   > @mbeckerle Here's an example: 
https://github.com/datadistillr/drill-humanname-functions. I'm sorry we weren't 
able to connect last week.
   
   If I understand this correctly, if a jar is on the classpath and has 
drill-module.conf in its root dir, then drill will find it and read that HOCON 
file to get the package to add to drill.classpath.scanning.packages. 
   
   Drill then appears to scan jars for class files for those packages. Not sure 
what it is doing with the class files. I imagine it is repackaging them somehow 
so Drill can use them on the drill distributed nodes. But it isn't yet clear to 
me how this aspect works. Do these classes just get loaded on the distributed 
drill nodes? Or is the classpath augmented in some way on the drill nodes so 
that they see a jar that contains all these classes?
   
   I have two questions: 
   
   (1) what about dependencies? The UDF may depend on libraries which depend on 
other libraries, etc. 
   
   (2) what about non-class files, e.g., things under src/main/resources of the 
project that go into the jar, but aren't "class" files? How do those things 
also get moved? How would code running in the drill node access these? The 
usual method is to call getResource(URL) with a URL that gives the path within 
a jar file to the resource in question. 
   
   Thanks for any info. 
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-21 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809174#comment-17809174
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1902751729

   > @cgivre @paul-rogers is there an example of a Drill UDF that is not part 
of the drill repository tree?
   > 
   > I'd like to understand the mechanisms for distributing any jar files and 
dependencies of the UDF that drill uses. I can't find any such in the 
quasi-UDFs that are in the Drill tree, because well, since they are part of 
Drill, and so are their dependencies, this problem doesn't exist.
   
   
   @mbeckerle Here's an example: 
https://github.com/datadistillr/drill-humanname-functions. I'm sorry we 
weren't able to connect last week.  
   
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-21 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809173#comment-17809173
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1902750285

   @cgivre @paul-rogers is there an example of a Drill UDF that is not part of 
the drill repository tree? 
   
   I'd like to understand the mechanisms for distributing any jar files and 
dependencies of the UDF that drill uses. I can't find any such in the 
quasi-UDFs that are in the Drill tree, because well, since they are part of 
Drill, and so are their dependencies, this problem doesn't exist. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-21 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17809172#comment-17809172
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1461099077


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java:
##
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.runtime1.api.ChoiceMetadata;
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.MetadataHandler;
+import org.apache.daffodil.runtime1.api.SequenceMetadata;
+import org.apache.daffodil.runtime1.api.SimpleElementMetadata;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.MapBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * This class transforms a DFDL/Daffodil schema into a Drill Schema.
+ */
+public class DrillDaffodilSchemaVisitor extends MetadataHandler {
+  private static final Logger logger = 
LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class);
+  /**
+   * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share 
a base class so we
+   * have a stack of MapBuilders, and when empty we use the SchemaBuilder

Review Comment:
   This is fixed in the latest commit. Created MapBuilderLike interface shared 
by SchemaBuilder and MapBuilder. I only populated it with the methods I needed. 
   
   The corresponding problem doesn't really occur in the rowWriter area as 
tupleWriter is the common underlying class used. 





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-16 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17807233#comment-17807233
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1453422371


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java:
##
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import 
org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory;
+import org.apache.drill.exec.store.dfs.DrillFileSystem;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Objects;
+
+import static 
org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory.*;
+import static 
org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema;
+
+public class DaffodilBatchReader implements ManagedReader {
+
+  private static final Logger logger = 
LoggerFactory.getLogger(DaffodilBatchReader.class);
+  private final RowSetLoader rowSetLoader;
+  private final CustomErrorContext errorContext;
+  private final DaffodilMessageParser dafParser;
+  private final InputStream dataInputStream;
+
+  public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan 
scan,
+  FileSchemaNegotiator negotiator) {
+
+errorContext = negotiator.parentErrorContext();
+DaffodilFormatConfig dafConfig = readerConfig.plugin.getConfig();
+
+String schemaURIString = dafConfig.getSchemaURI(); // 
"schema/complexArray1.dfdl.xsd";
+String rootName = dafConfig.getRootName();
+String rootNamespace = dafConfig.getRootNamespace();
+boolean validationMode = dafConfig.getValidationMode();
+
+URI dfdlSchemaURI;
+try {
+  dfdlSchemaURI = new URI(schemaURIString);
+} catch (URISyntaxException e) {
+  throw UserException.validationError(e).build(logger);
+}
+
+FileDescrip file = negotiator.file();
+DrillFileSystem fs = file.fileSystem();
+URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI);
+
+DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory();
+DataProcessor dp;
+try {
+  dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, 
rootNamespace);
+} catch (CompileFailure e) {
+  throw UserException.dataReadError(e)
+  .message(String.format("Failed to get Daffodil DFDL processor for: 
%s", fsSchemaURI))
+  .addContext(errorContext).addContext(e.getMessage()).build(logger);
+}
+// Create the corresponding Drill schema.
+// Note: this could be a very large schema. Think of a large complex RDBMS 
schema,
+// all of it, hundreds of tables, but all part of the same metadata tree.
+TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp);
+// Inform Drill about the schema
+negotiator.tableSchema(drillSchema, true);
+
+//
+// DATA TIME: Next we construct the runtime objects, and open files.
+//
+// We get the DaffodilMessageParser, which is a stateful driver for 
daffodil that
+// actually does the parsing.
+rowSetLoader = negotiator.build().writer();
+
+// We construct the Daffodil InfosetOutputter which the daffodil parser 
uses to
+// convert infoset event 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-14 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806487#comment-17806487
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1890990577

   > > @mbeckerle With respect to style, I tried to reply to that comment, but 
the thread won't let me. In any event, Drill classes will typically start with 
the constructor, then have whatever methods are appropriate for the class. The 
logger creation usually happens before the constructor. I think all of your 
other classes followed this format, so the one or two that didn't kind of 
jumped out at me.
   > 
   > @cgivre I believe the style issues are all fixed. The build did not get 
any codestyle issues.
   
   The issue I was referring to was more around the organization of a few 
classes.  Usually we'll have the constructor (if present) at the top followed 
by any class methods.  I think there was a class or two where the constructor 
was at the bottom or something like that.  In any event, consider the issue 
resolved.




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-14 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806486#comment-17806486
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1451758017


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java:
##
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import 
org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory;
+import org.apache.drill.exec.store.dfs.DrillFileSystem;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Objects;
+
+import static 
org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory.*;
+import static 
org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema;
+
+public class DaffodilBatchReader implements ManagedReader {
+
+  private static final Logger logger = 
LoggerFactory.getLogger(DaffodilBatchReader.class);
+  private final RowSetLoader rowSetLoader;
+  private final CustomErrorContext errorContext;
+  private final DaffodilMessageParser dafParser;
+  private final InputStream dataInputStream;
+
+  public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan 
scan,
+  FileSchemaNegotiator negotiator) {
+
+errorContext = negotiator.parentErrorContext();
+DaffodilFormatConfig dafConfig = readerConfig.plugin.getConfig();
+
+String schemaURIString = dafConfig.getSchemaURI(); // 
"schema/complexArray1.dfdl.xsd";
+String rootName = dafConfig.getRootName();
+String rootNamespace = dafConfig.getRootNamespace();
+boolean validationMode = dafConfig.getValidationMode();
+
+URI dfdlSchemaURI;
+try {
+  dfdlSchemaURI = new URI(schemaURIString);
+} catch (URISyntaxException e) {
+  throw UserException.validationError(e).build(logger);
+}
+
+FileDescrip file = negotiator.file();
+DrillFileSystem fs = file.fileSystem();
+URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI);
+
+DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory();
+DataProcessor dp;
+try {
+  dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, 
rootNamespace);
+} catch (CompileFailure e) {
+  throw UserException.dataReadError(e)
+  .message(String.format("Failed to get Daffodil DFDL processor for: 
%s", fsSchemaURI))
+  .addContext(errorContext).addContext(e.getMessage()).build(logger);
+}
+// Create the corresponding Drill schema.
+// Note: this could be a very large schema. Think of a large complex RDBMS 
schema,
+// all of it, hundreds of tables, but all part of the same metadata tree.
+TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp);
+// Inform Drill about the schema
+negotiator.tableSchema(drillSchema, true);
+
+//
+// DATA TIME: Next we construct the runtime objects, and open files.
+//
+// We get the DaffodilMessageParser, which is a stateful driver for 
daffodil that
+// actually does the parsing.
+rowSetLoader = negotiator.build().writer();
+
+// We construct the Daffodil InfosetOutputter which the daffodil parser 
uses to
+// convert infoset event 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-14 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806484#comment-17806484
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1451757410


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java:
##
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.runtime1.api.ChoiceMetadata;
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.MetadataHandler;
+import org.apache.daffodil.runtime1.api.SequenceMetadata;
+import org.apache.daffodil.runtime1.api.SimpleElementMetadata;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.MapBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * This class transforms a DFDL/Daffodil schema into a Drill Schema.
+ */
+public class DrillDaffodilSchemaVisitor extends MetadataHandler {
+  private static final Logger logger = 
LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class);
+  /**
+   * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share 
a base class so we
+   * have a stack of MapBuilders, and when empty we use the SchemaBuilder

Review Comment:
   This is likely music to @paul-rogers's ears.





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-14 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806482#comment-17806482
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1451756763


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java:
##
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.japi.Compiler;
+import org.apache.daffodil.japi.Daffodil;
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.daffodil.japi.Diagnostic;
+import org.apache.daffodil.japi.InvalidParserException;
+import org.apache.daffodil.japi.InvalidUsageException;
+import org.apache.daffodil.japi.ProcessorFactory;
+import org.apache.daffodil.japi.ValidationMode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.channels.Channels;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL 
schema so that one can
+ * obtain a DataProcessor for use with DaffodilMessageParser.
+ * 
+ * TODO: Needs to use a cache to avoid reloading/recompiling every time.
+ */
+public class DaffodilDataProcessorFactory {
+  // Default constructor is used.
+
+  private static final Logger logger = 
LoggerFactory.getLogger(DaffodilDataProcessorFactory.class);
+
+  private DataProcessor dp;
+
+  /**
+   * Gets a Daffodil DataProcessor given the necessary arguments to compile or 
reload it.
+   *
+   * @param schemaFileURI
+   * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd 
extension)
+   * @param validationMode
+   * Use true to request Daffodil built-in 'limited' validation. Use false 
for no validation.
+   * @param rootName
+   * Local name of root element of the message. Can be null to use the 
first element declaration
+   * of the primary schema file. Ignored if reloading a pre-compiled 
schema.
+   * @param rootNS
+   * Namespace URI as a string. Can be null to use the target namespace of 
the primary schema
+   * file or if it is unambiguous what element is the rootName. Ignored if 
reloading a
+   * pre-compiled schema.
+   * @return the DataProcessor
+   * @throws CompileFailure
+   * - if schema compilation fails
+   */
+  public DataProcessor getDataProcessor(URI schemaFileURI, boolean 
validationMode, String rootName,
+  String rootNS)
+  throws CompileFailure {
+
+DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory();
+boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin");
+if (isPrecompiled) {
+  if (Objects.nonNull(rootName) && !rootName.isEmpty()) {
+// A usage error. You shouldn't supply the name and optionally 
namespace if loading
+// precompiled schema because those are built into it. Should be null 
or "".
+logger.warn("Root element name '{}' is ignored when used with 
precompiled DFDL schema.",
+rootName);
+  }
+  try {
+dmp.loadSchema(schemaFileURI);
+  } catch (IOException | InvalidParserException e) {
+throw new CompileFailure(e);
+  }
+  dmp.setupDP(validationMode, null);
+} else {
+  List pfDiags;
+  try {
+pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS);
+  } catch (URISyntaxException | IOException e) {
+throw new CompileFailure(e);
+  }
+  dmp.setupDP(validationMode, pfDiags);
+}
+return dmp.dp;
+  }
+
+  private void loadSchema(URI schemaFileURI) throws IOException, 
InvalidParserException {
+Compiler c = Daffodil.compiler();
+dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream()));

Review Comment:
   This definitely seems like an area where there is potential for a lot of 
different things to go wrong.  My view is we should just do our best to provide 
clear error 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-14 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17806481#comment-17806481
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1451756527


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java:
##
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.japi.Compiler;
+import org.apache.daffodil.japi.Daffodil;
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.daffodil.japi.Diagnostic;
+import org.apache.daffodil.japi.InvalidParserException;
+import org.apache.daffodil.japi.InvalidUsageException;
+import org.apache.daffodil.japi.ProcessorFactory;
+import org.apache.daffodil.japi.ValidationMode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.channels.Channels;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL 
schema so that one can
+ * obtain a DataProcessor for use with DaffodilMessageParser.
+ * 
+ * TODO: Needs to use a cache to avoid reloading/recompiling every time.
+ */
+public class DaffodilDataProcessorFactory {
+  // Default constructor is used.
+
+  private static final Logger logger = 
LoggerFactory.getLogger(DaffodilDataProcessorFactory.class);
+
+  private DataProcessor dp;
+
+  /**
+   * Gets a Daffodil DataProcessor given the necessary arguments to compile or 
reload it.
+   *
+   * @param schemaFileURI
+   * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd 
extension)
+   * @param validationMode
+   * Use true to request Daffodil built-in 'limited' validation. Use false 
for no validation.
+   * @param rootName
+   * Local name of root element of the message. Can be null to use the 
first element declaration
+   * of the primary schema file. Ignored if reloading a pre-compiled 
schema.
+   * @param rootNS
+   * Namespace URI as a string. Can be null to use the target namespace of 
the primary schema
+   * file or if it is unambiguous what element is the rootName. Ignored if 
reloading a
+   * pre-compiled schema.
+   * @return the DataProcessor
+   * @throws CompileFailure
+   * - if schema compilation fails
+   */
+  public DataProcessor getDataProcessor(URI schemaFileURI, boolean 
validationMode, String rootName,
+  String rootNS)
+  throws CompileFailure {
+
+DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory();
+boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin");
+if (isPrecompiled) {
+  if (Objects.nonNull(rootName) && !rootName.isEmpty()) {
+// A usage error. You shouldn't supply the name and optionally 
namespace if loading
+// precompiled schema because those are built into it. Should be null 
or "".
+logger.warn("Root element name '{}' is ignored when used with 
precompiled DFDL schema.",
+rootName);
+  }
+  try {
+dmp.loadSchema(schemaFileURI);
+  } catch (IOException | InvalidParserException e) {
+throw new CompileFailure(e);

Review Comment:
   My thought here would be to fail as quickly as possible.  If the DFDL schema 
can't be read, I'm assuming that we cannot proceed, so throwing an exception 
would be the right thing to do IMHO.  With that said, we should make sure we 
provide a good error message that would explain what went wrong. 
   One of the issues we worked on for a while with Drill was that it would fail 
and you'd get a stack trace w/o a clear idea of what the actual issue is and 
how to rectify it. 
   





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-09 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17804917#comment-17804917
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1883962208

   > @mbeckerle With respect to style, I tried to reply to that comment, but 
the thread won't let me. In any event, Drill classes will typically start with 
the constructor, then have whatever methods are appropriate for the class. The 
logger creation usually happens before the constructor. I think all of your 
other classes followed this format, so the one or two that didn't kind of 
jumped out at me.
   
   @cgivre I believe the style issues are all fixed. The build did not get any 
codestyle issues. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-07 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803984#comment-17803984
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1880110452

   > > @mbeckerle I had a thought about your TODO list. See inline.
   > > > This is ready for a next review. All the scalar types are now 
implemented with typed setter calls.
   > > > The prior review comments have all been addressed I believe.
   > > > Remaining things to do include:
   > > > 
   > > > 1. How to get the compiled DFDL schema object so it can be loaded by 
daffodil out at the distributed Drill nodes.
   > > 
   > > 
   > > I was thinking about this and I remembered something that might be 
useful. Drill has support for User Defined Functions (UDF) which are written in 
Java. To add a UDF to Drill, you also have to write some Java classes in a 
particular way, and include the JARs. Much like the DFDL class files, the UDF 
JARs must be accessible to all nodes of a Drill cluster.
   > > Additionally, Drill has the capability of adding UDFs dynamically. This 
feature was added here: #574. Anyway, I wonder if we could use a similar 
mechanism to load and store the DFDL files so that they are accessible to all 
Drill nodes. What do you think?
   > 
   > Excellent: So Drill has all the machinery, it's just a question of 
repackaging it so it's available for this usage pattern, which is a bit 
different from Drill's UDFs, but also very similar.
   > 
   > There are two user scenarios which we can call production and test.
   > 
   > 1. Production: binary compiled DFDL schema file + code jars for Daffodil's 
own UDFs and "layers" plugins. This should, ideally, cache the compiled schema 
and not reload it for every query (at every node), but keep the same loaded 
instance in memory in a persistent JVM image on each node. For large production 
DFDL schemas this is the only sensible mechanism as it can take minutes to 
compile large DFDL schemas.
   > 2. Test: on-the-fly centralized compilation of DFDL schema (from a 
combination of jars and files) to create and cache (to avoid recompiling) the 
binary compiled DFDL schema file. Then using that compiled binary file, as item 
1. For small DFDL schemas this can be fast enough for production use. Ideally, 
if the DFDL schema is unchanged this would reuse the compiled binary file, but 
that's an optimization that may not matter much.
   > 
   > Kinds of objects involved are:
   > 
   > * Daffodil plugin code jars
   > * DFDL schema jars
   > * DFDL schema files (just not packaged into a jar)
   > * Daffodil compiled schema binary file
   > * Daffodil config file - parameters, tunables, and options needed at 
compile time and/or runtime
   > 
   > Code jars: Daffodil provides two extension features for DFDL users - DFDL 
UDFs and DFDL 'layers' (ex: plug-ins for uudecode, or gunzip algorithms used in 
part of the data format). Those are ordinary compiled class files in jars, so 
in all scenarios those jars are needed on the node class path if the DFDL 
schema uses them. Daffodil dynamically finds and loads these from the classpath 
in regular Java Service-Provider Interface (SPI) mechanisms.
   > 
   > Schema jars: Daffodil packages DFDL schema files (source files i.e., 
mySchema.dfdl.xsd) into jar files to allow inter-schema dependencies to be 
managed using ordinary jar/java-style managed dependencies. Tools like sbt and 
maven can express the dependencies of one schema on another, grab and pull them 
together, etc. Daffodil has a resolver so when one schema file references 
another with include/import it searches the class path directories and jars for 
the files.
   > 
   > Schema jars are only needed centrally when compiling the schema to a 
binary file. All references to the jar files for inter-schema file references 
are compiled into the compiled binary file.
   > 
   > It is possible for one DFDL schema 'project' to define a DFDL schema, 
along with the code for a plugin like a Daffodil UDF or layer. In that case the 
one jar created is both a code jar and a schema jar. The schema jar aspects are 
used when the schema is compiled and ignored at Daffodil runtime. The code jar 
aspects are used at Daffodil run time and ignored at schema compilation time. 
So such a jar that is both code and schema jar needs to be on the class path in 
both places, but there's no interaction of the two things.
   > 
   > Binary Compiled Schema File: Centrally, DFDL schemas in files and/or jars 
are compiled to create a single binary object which can be reloaded in order to 
actually use the schema to parse/unparse data.
   > 
   > * These binary files are tied to a specific version+build of Daffodil. 
(They are just a java object serialization of the runtime data structures used 
by Daffodil).
   > * Once reloaded into a JVM to 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-07 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803983#comment-17803983
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1880109717

   @mbeckerle 
   With respect to style, I tried to reply to that comment, but the thread 
won't let me.   In any event, Drill classes will typically start with the 
constructor, then have whatever methods are appropriate for the class.  The 
logger creation usually happens before the constructor.  I think all of your 
other classes followed this format, so the one or two that didn't kind of 
jumped out at me. 




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-05 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803594#comment-17803594
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1442993784


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java:
##
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.runtime1.api.ChoiceMetadata;
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.MetadataHandler;
+import org.apache.daffodil.runtime1.api.SequenceMetadata;
+import org.apache.daffodil.runtime1.api.SimpleElementMetadata;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.MapBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * This class transforms a DFDL/Daffodil schema into a Drill Schema.
+ */
+public class DrillDaffodilSchemaVisitor extends MetadataHandler {
+  private static final Logger logger = 
LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class);
+  /**
+   * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share 
a base class so we
+   * have a stack of MapBuilders, and when empty we use the SchemaBuilder

Review Comment:
   Note that this awkwardness effectively doubles the code size of things that 
interface to Drill. 
   
   This duplication of similar behavior for schema and map builders (and 
rowWriters and mapWriters) is expected and typical of systems that start from a 
tabular view of the data world and later add the features needed for 
hierarchical data. Nevertheless it is awkward when one is dealing entirely with 
hierarchical data. 
   
   A MetaBuilder that does the map thing if the builder is a map, and the 
schema thing if the builder is a schema would eliminate this. This could be an 
interface mixed into both SchemaBuilder and MapBuilder (could also be called 
MapBuilderLike). 
   
   The same discontinuity at the base holds for RowWriter vs. MapWriter in the 
runtime handling of data. Again it doubles the code size/complexity, every fix 
goes in 2 places, etc. A MapWriterLike interface could be factored out.
   
   Maybe we should build such mechanisms to avoid this, and then use them to 
improve this Daffodil plugin?
   



##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java:
##
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.daffodil.schema;
+
+import org.apache.daffodil.japi.InvalidParserException;
+import org.apache.daffodil.japi.DataProcessor;
+import org.apache.daffodil.runtime1.api.PrimitiveType;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import com.google.common.annotations.VisibleForTesting;
+import 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-05 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803592#comment-17803592
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1878896878

   > @mbeckerle I had a thought about your TODO list. See inline.
   > 
   > > This is ready for a next review. All the scalar types are now 
implemented with typed setter calls.
   > > The prior review comments have all been addressed I believe.
   > > Remaining things to do include:
   > > 
   > > 1. How to get the compiled DFDL schema object so it can be loaded by 
daffodil out at the distributed Drill nodes.
   > 
   > I was thinking about this and I remembered something that might be useful. 
Drill has support for User Defined Functions (UDF) which are written in Java. 
To add a UDF to Drill, you also have to write some Java classes in a particular 
way, and include the JARs. Much like the DFDL class files, the UDF JARs must be 
accessible to all nodes of a Drill cluster.
   > 
   > Additionally, Drill has the capability of adding UDFs dynamically. This 
feature was added here: #574. Anyway, I wonder if we could use a similar 
mechanism to load and store the DFDL files so that they are accessible to all 
Drill nodes. What do you think?
   
   Excellent: So Drill has all the machinery, it's just a question of 
repackaging it so it's available for this usage pattern, which is a bit 
different from Drill's UDFs, but also very similar. 
   
   There are two user scenarios which we can call production and test.
   
   1. Production: binary compiled DFDL schema file + code jars for Daffodil's 
own UDFs and "layers" plugins. This should, ideally, cache the compiled schema 
and not reload it for every query (at every node), but keep the same loaded 
instance in memory in a persistent JVM image on each node. For large production 
DFDL schemas this is the only sensible mechanism as it can take minutes to 
compile large DFDL schemas. 
   
   2. Test: on-the-fly centralized compilation of DFDL schema (from a 
combination of jars and files) to create and cache (to avoid recompiling) the 
binary compiled DFDL schema file. Then using that compiled binary file, as item 
1. For small DFDL schemas this can be fast enough for production use. Ideally, 
if the DFDL schema is unchanged this would reuse the compiled binary file, but 
that's an optimization that may not matter much. 
   
   Kinds of objects involved are:
   
   - Daffodil plugin code jars
   - DFDL schema jars
   - DFDL schema files (just not packaged into a jar)
   - Daffodil compiled schema binary file
   - Daffodil config file - parameters, tunables, and options needed at compile 
time and/or runtime
   
   Code jars: Daffodil provides two extension features for DFDL users - DFDL 
UDFs and DFDL 'layers' (ex: plug-ins for uudecode, or gunzip algorithms used in 
part of the data format). Those are ordinary compiled class files in jars, so 
in all scenarios those jars are needed on the node class path if the DFDL 
schema uses them. Daffodil dynamically finds and loads these from the classpath 
in regular Java Service-Provider Interface (SPI) mechanisms. 
   
   Schema jars: Daffodil packages DFDL schema files (source files i.e., 
mySchema.dfdl.xsd) into jar files to allow inter-schema dependencies to be 
managed using ordinary jar/java-style managed dependencies. Tools like sbt and 
maven can express the dependencies of one schema on another, grab and pull them 
together, etc. Daffodil has a resolver so when one schema file references 
another with include/import it searches the class path directories and jars for 
the files. 
   
   Schema jars are only needed centrally when compiling the schema to a binary 
file. All references to the jar files for inter-schema file references are 
compiled into the compiled binary file. 
   
   It is possible for one DFDL schema 'project' to define a DFDL schema, along 
with the code for a plugin like a Daffodil UDF or layer. In that case the one 
jar created is both a code jar and a schema jar. The schema jar aspects are 
used when the schema is compiled and ignored at Daffodil runtime. The code jar 
aspects are used at Daffodil run time and ignored at schema compilation time. 
So such a jar that is both code and schema jar needs to be on the class path in 
both places, but there's no interaction of the two things. 
   
   Binary Compiled Schema File: Centrally, DFDL schemas in files and/or jars 
are compiled to create a single binary object which can be reloaded in order to 
actually use the schema to parse/unparse data. 
   
   - These binary files are tied to a specific version+build of Daffodil. (They 
are just a java object serialization of the runtime data structures used by 
Daffodil). 
   - Once reloaded into a JVM to create a Daffodil DataProcessor object, that 
object is 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803367#comment-17803367
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1877962587

   @mbeckerle I had a thought about your TODO list.   See inline.
   
   > This is ready for a next review. All the scalar types are now implemented 
with typed setter calls.
   > 
   > The prior review comments have all been addressed I believe.
   > 
   > Remaining things to do include:
   > 
   > 1. How to get the compiled DFDL schema object so it can be loaded by 
daffodil out at the distributed Drill nodes.
   
   I was thinking about this and I remembered something that might be useful.  
Drill has support for User Defined Functions (UDF) which are written in Java.  
To add a UDF to Drill, you also have to write some Java classes in a particular 
way, and include the JARs.   Much like the DFDL class files, the UDF JARs must 
be accessible to all nodes of a Drill cluster. 
   
   Additionally, Drill has the capability of adding UDFs dynamically.   This 
feature was added here: https://github.com/apache/drill/pull/574.  Anyway, I 
wonder if we could use a similar mechanism to load and store the DFDL files so 
that they are accessible to all Drill nodes.  What do you think?
   
   
   > 2. Test of nilled values (and more tests generally to show deeply nested 
and repeating nested objects work.)
   > 3. Errors - revisit every place errors are detected or thrown to make sure 
these are being done the right way for DFDL schema compilation and runtime 
errors as well.
   
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803353#comment-17803353
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1877915796

   This is ready for a next review. All the scalar types are now implemented 
with typed setter calls. 
   
   The prior review comments have all been addressed I believe.
   
   Remaining things to do include:
   
   1. How to get the compiled DFDL schema object so it can be loaded by 
daffodil out at the distributed Drill nodes.
   2. Test of nilled values (and more tests generally to show deeply nested and 
repeating nested objects work.)
   3. Errors - revisit every place errors are detected or thrown to make sure 
these are being done the right way for DFDL schema compilation and runtime 
errors as well. 
   




> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803351#comment-17803351
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1442338979


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java:
##
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetArray;
+import org.apache.daffodil.runtime1.api.InfosetComplexElement;
+import org.apache.daffodil.japi.infoset.InfosetOutputter;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.PrimitiveType;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ColumnWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * Adapts Daffodil parser infoset event calls to Drill writer calls
+ * to fill in Drill data rows.
+ */
+public class DaffodilDrillInfosetOutputter
+extends InfosetOutputter {
+
+  private boolean isOriginalRoot() {
+boolean result = currentTupleWriter() == rowSetWriter;

Review Comment:
   Next commit will have files reformatted based on the eclipse settings in the 
dev-support/formatter directory, as implemented by IntelliJ IDEA when those 
settings were imported. 





> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803337#comment-17803337
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on PR #2836:
URL: https://github.com/apache/drill/pull/2836#issuecomment-1877814024

   Let me respond between the paragraphs
   
   On Tue, Jan 2, 2024 at 11:49 PM Paul Rogers ***@***.***>
   wrote:
   
   > Hi Mike,
   >
   > Just jumping in with a random thought. Drill has accumulated a number of
   > schema systems: Parquet metadata cache, HMS, Drill's own metastore,
   > "provided schema", and now DFDL. All provide ways of defining data: be it
   > Parquet, JSON, CSV or whatever. One can't help but wonder, should some
   > future version try to reduce this variation somewhat? Maybe map all the
   > variations to DFDL? Map DFDL to Drill's own mechanisms?
   >
   > Well we can dream can't we :-)
   
   I can contribute the ideas in
   
https://daffodil.apache.org/dev/design-notes/Proposed-DFDL-Standard-Profile.md
   which
   is an effort to restrict the DFDL language so that schemas written in DFDL
   can work more smoothly with Drill, NiFi, Spark, Flink, Beam, etc. etc.
   
   DFDL's data model is too restrictive to be "the model" for Drill since
   Drill wants to query even unstructured data like XML without schema. DFDL's
   data model is targeted only at structured data.
   
   Drill's data model and APIs seem optimized for streaming block-buffered
   top-level rows of data (the EVF API does anyway). Top level row-sets are
   first-class citizens, as are the fields of said rows. Fields containing
   arrays of maps (possibly containing more arrays of maps, and so on deeply
   nested) are not handled uniformly with the same block-buffered "row-like"
   mechanisms. The APIs are similar, but not polymorphic. I suspect that the
   block-buffered data streaming in Drill only happens for top-level rows,
   because there is no test for whether or not you are allowed to create
   another array item like there is a test for creating another row in a
   row-set writer. There is no control inversion where an adapter must give
   back control to Drill in the middle of trying to write an array.
   
   The current Drill/Daffodil interface I've created doesn't cope with
   header-body* files (ex: PCAP, whose format has a header record, then
   repeating packet records) as it has no way of returning just the body
   records as top level rows. So while there exists a DFDL schema for PCAP,
   you really do want to use a dedicated PCAP Drill adapter which hands back
   rows, not Daffodil which will parse the entire PCAP file into one huge row
   containing a monster sub-array of packets, where each packet is a map
   within the array of maps. This is ok for now as many files where DFDL is
   used are not like PCAP. They are just repeating records of one format with
   no special whole-file header. Eventually we will want to be able to supply
   a path to tell the Drill/Daffodil interface that you only want the packet
   array as the output rows. (This is the unimplemented Daffodil "onPath(...)"
   API feature. We haven't needed this yet for DFDL work in cybersecurity, but
   it was anticipated 10+ years back as essential for data integration.)
   
   
   > Drill uses two kinds of metadata: schema definitions and file metadata
   > used
   > for scan pruning. Schema information could be used at plan time (to
   > provide
   > column types), but certainly at scan time (to "discover" the defined
   > schema.) File metadata is used primarily at plan time to work out how to
   > distribute work.
   
   
   DFDL has zero notion of file metadata. It doesn't know whether data even
   comes from a file or an open TCP socket. Daffodil/DFDL just sees a
   java.io.InputStream.
   The schema it uses for a given file is specified by the API call. Daffodil
   does nothing itself to try to find or identify any schema.
   
   So we're "blank slate" on this issue with DFDL.
   
   
   >
   >
   > A bit of background on scan pruning. Back in the day, it was common to
   > have
   > thousands or millions of files in Hadoop to scan: this was why tools like
   > Drill were distributed: divide and conquer. And, of course, the fastest
   > scan is to skip files that we know can't contain the information we want.
   > File metadata captures this information outside of the files themselves.
   > HMS was the standard solution in the Hadoop days. (Amazon Glue, for S3, is
   > evidently based on HMS.)
   >
   > For example, Drill's Parquet metadata cache, the Drill metastore and HMS
   > all provide both schema and file metadata information. The schema
   > information mainly helped with schema evolution: over time, different
   > files
   > have different sets of columns. File metadata provides information *about*
   > the file, such as the data ranges stored in each file. For Parquet, 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803334#comment-17803334
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1442278098


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java:
##
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetArray;
+import org.apache.daffodil.runtime1.api.InfosetComplexElement;
+import org.apache.daffodil.japi.infoset.InfosetOutputter;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.PrimitiveType;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ColumnWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * Adapts Daffodil parser infoset event calls to Drill writer calls
+ * to fill in Drill data rows.
+ */
+public class DaffodilDrillInfosetOutputter
+extends InfosetOutputter {
+
+  private boolean isOriginalRoot() {
+boolean result = currentTupleWriter() == rowSetWriter;
+if (result)
+  assert(tupleWriterStack.size() == 1);
+return result;
+  }
+
+  /**
+   * True if the next startComplex call will be for the
+   * DFDL infoset root element whose children are the columns of
+   * the row set.
+   */
+  private boolean isRootElement = true;
+
+  /**
+   * Stack that is used only if we have sub-structures that are not
+   * simple-type fields of the row.
+   */
+  private final Stack<TupleWriter> tupleWriterStack = new Stack<>();
+
+  private final Stack<ArrayWriter> arrayWriterStack = new Stack<>();
+
+  private TupleWriter currentTupleWriter() {
+return tupleWriterStack.peek();
+  }
+
+  private ArrayWriter currentArrayWriter() {
+return arrayWriterStack.peek();
+  }
+
+
+  private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class);
+
+  private DaffodilDrillInfosetOutputter() {} // no default constructor
+
+  private RowSetLoader rowSetWriter;
+
+  public DaffodilDrillInfosetOutputter(RowSetLoader writer) {
+this.rowSetWriter = writer;
+this.tupleWriterStack.push(writer);
+  }
+
+  @Override
+  public void reset() {
+tupleWriterStack.clear();
+tupleWriterStack.push(rowSetWriter);
+arrayWriterStack.clear();
+this.isRootElement = true;
+checkCleanState();
+  }
+
+  private void checkCleanState() {
+assert(isOriginalRoot());
+assert(arrayWriterStack.isEmpty());
+assert(isRootElement);
+  }
+
+  @Override
+  public void startDocument() {
+checkCleanState();
+  }
+
+  @Override
+  public void endDocument() {
+checkCleanState();
+  }
+
+  private String colName(ElementMetadata md) {
+return DrillDaffodilSchemaVisitor.makeColumnName(md);
+  }
+
+  @Override
+  public void startSimple(InfosetSimpleElement ise) {
+assert (!isRootElement);
+ElementMetadata md = ise.metadata();
+String colName = colName(md);
+ColumnWriter cw;
+if (md.isArray()) {
+  // A simple type array
+  assert(!arrayWriterStack.isEmpty());
+  cw = currentArrayWriter().scalar();
+} else {
+  // A simple element within a map
+  // Note the map itself might be an array
+  // but we don't care about that here.
+  cw = 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803249#comment-17803249
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

mbeckerle commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1442045159


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java:
##
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetArray;
+import org.apache.daffodil.runtime1.api.InfosetComplexElement;
+import org.apache.daffodil.japi.infoset.InfosetOutputter;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.PrimitiveType;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ColumnWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * Adapts Daffodil parser infoset event calls to Drill writer calls
+ * to fill in Drill data rows.
+ */
+public class DaffodilDrillInfosetOutputter
+extends InfosetOutputter {
+
+  private boolean isOriginalRoot() {
+boolean result = currentTupleWriter() == rowSetWriter;
+if (result)
+  assert(tupleWriterStack.size() == 1);
+return result;
+  }
+
+  /**
+   * True if the next startComplex call will be for the
+   * DFDL infoset root element whose children are the columns of
+   * the row set.
+   */
+  private boolean isRootElement = true;
+
+  /**
+   * Stack that is used only if we have sub-structures that are not
+   * simple-type fields of the row.
+   */
+  private final Stack<TupleWriter> tupleWriterStack = new Stack<>();
+
+  private final Stack<ArrayWriter> arrayWriterStack = new Stack<>();
+
+  private TupleWriter currentTupleWriter() {
+return tupleWriterStack.peek();
+  }
+
+  private ArrayWriter currentArrayWriter() {
+return arrayWriterStack.peek();
+  }
+
+
+  private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class);
+
+  private DaffodilDrillInfosetOutputter() {} // no default constructor
+
+  private RowSetLoader rowSetWriter;
+
+  public DaffodilDrillInfosetOutputter(RowSetLoader writer) {
+this.rowSetWriter = writer;
+this.tupleWriterStack.push(writer);
+  }
+
+  @Override
+  public void reset() {
+tupleWriterStack.clear();
+tupleWriterStack.push(rowSetWriter);
+arrayWriterStack.clear();
+this.isRootElement = true;
+checkCleanState();
+  }
+
+  private void checkCleanState() {
+assert(isOriginalRoot());
+assert(arrayWriterStack.isEmpty());
+assert(isRootElement);
+  }
+
+  @Override
+  public void startDocument() {
+checkCleanState();
+  }
+
+  @Override
+  public void endDocument() {
+checkCleanState();
+  }
+
+  private String colName(ElementMetadata md) {
+return DrillDaffodilSchemaVisitor.makeColumnName(md);
+  }
+
+  @Override
+  public void startSimple(InfosetSimpleElement ise) {
+assert (!isRootElement);
+ElementMetadata md = ise.metadata();
+String colName = colName(md);
+ColumnWriter cw;
+if (md.isArray()) {
+  // A simple type array
+  assert(!arrayWriterStack.isEmpty());
+  cw = currentArrayWriter().scalar();
+} else {
+  // A simple element within a map
+  // Note the map itself might be an array
+  // but we don't care about that here.
+  cw = 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-04 Thread ASF GitHub Bot (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17803180#comment-17803180
 ] 

ASF GitHub Bot commented on DRILL-8474:
---

cgivre commented on code in PR #2836:
URL: https://github.com/apache/drill/pull/2836#discussion_r1441799950


##
contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java:
##
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.daffodil;
+
+import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
+import org.apache.daffodil.runtime1.api.ElementMetadata;
+import org.apache.daffodil.runtime1.api.InfosetArray;
+import org.apache.daffodil.runtime1.api.InfosetComplexElement;
+import org.apache.daffodil.japi.infoset.InfosetOutputter;
+import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
+import org.apache.daffodil.runtime1.api.PrimitiveType;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils;
+import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ColumnWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Stack;
+
+/**
+ * Adapts Daffodil parser infoset event calls to Drill writer calls
+ * to fill in Drill data rows.
+ */
+public class DaffodilDrillInfosetOutputter
+extends InfosetOutputter {
+
+  private boolean isOriginalRoot() {
+boolean result = currentTupleWriter() == rowSetWriter;
+if (result)
+  assert(tupleWriterStack.size() == 1);
+return result;
+  }
+
+  /**
+   * True if the next startComplex call will be for the
+   * DFDL infoset root element whose children are the columns of
+   * the row set.
+   */
+  private boolean isRootElement = true;
+
+  /**
+   * Stack that is used only if we have sub-structures that are not
+   * simple-type fields of the row.
+   */
+  private final Stack<TupleWriter> tupleWriterStack = new Stack<>();
+
+  private final Stack<ArrayWriter> arrayWriterStack = new Stack<>();
+
+  private TupleWriter currentTupleWriter() {
+return tupleWriterStack.peek();
+  }
+
+  private ArrayWriter currentArrayWriter() {
+return arrayWriterStack.peek();
+  }
+
+
+  private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class);
+
+  private DaffodilDrillInfosetOutputter() {} // no default constructor
+
+  private RowSetLoader rowSetWriter;
+
+  public DaffodilDrillInfosetOutputter(RowSetLoader writer) {
+this.rowSetWriter = writer;
+this.tupleWriterStack.push(writer);
+  }
+
+  @Override
+  public void reset() {
+tupleWriterStack.clear();
+tupleWriterStack.push(rowSetWriter);
+arrayWriterStack.clear();
+this.isRootElement = true;
+checkCleanState();
+  }
+
+  private void checkCleanState() {
+assert(isOriginalRoot());
+assert(arrayWriterStack.isEmpty());
+assert(isRootElement);
+  }
+
+  @Override
+  public void startDocument() {
+checkCleanState();
+  }
+
+  @Override
+  public void endDocument() {
+checkCleanState();
+  }
+
+  private String colName(ElementMetadata md) {
+return DrillDaffodilSchemaVisitor.makeColumnName(md);
+  }
+
+  @Override
+  public void startSimple(InfosetSimpleElement ise) {
+assert (!isRootElement);
+ElementMetadata md = ise.metadata();
+String colName = colName(md);
+ColumnWriter cw;
+if (md.isArray()) {
+  // A simple type array
+  assert(!arrayWriterStack.isEmpty());
+  cw = currentArrayWriter().scalar();
+} else {
+  // A simple element within a map
+  // Note the map itself might be an array
+  // but we don't care about that here.
+  cw = 

[jira] [Commented] (DRILL-8474) Add Daffodil Format Plugin

2024-01-03 Thread Charles Givre (Jira)


[ 
https://issues.apache.org/jira/browse/DRILL-8474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17802191#comment-17802191
 ] 

Charles Givre commented on DRILL-8474:
--

[https://github.com/apache/drill/pull/2836]

> Add Daffodil Format Plugin
> --
>
> Key: DRILL-8474
> URL: https://issues.apache.org/jira/browse/DRILL-8474
> Project: Apache Drill
>  Issue Type: New Feature
>Affects Versions: 1.21.1
>Reporter: Charles Givre
>Priority: Major
> Fix For: 1.22.0
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)