[ 
https://issues.apache.org/jira/browse/DRILL-7233?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17010765#comment-17010765
 ] 

ASF GitHub Bot commented on DRILL-7233:
---------------------------------------

arina-ielchiieva commented on pull request #1778: DRILL-7233: Format Plugin for HDF5
URL: https://github.com/apache/drill/pull/1778#discussion_r364287412
 
 

 ##########
 File path: contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
 ##########
 @@ -0,0 +1,907 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.hdf5;
+
+import org.apache.drill.categories.RowSetTests;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.ExecTest;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.rpc.RpcException;
+import org.apache.drill.exec.store.dfs.ZipCodec;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+@Category(RowSetTests.class)
+public class TestHDF5Format extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+    HDF5FormatConfig formatConfig = new HDF5FormatConfig();
+    cluster.defineFormat("dfs", "hdf5", formatConfig);
+    cluster.defineFormat("cp", "hdf5", formatConfig);
+    dirTestWatcher.copyResourceToRoot(Paths.get("hdf5/"));
+  }
+
+  @Test
+  public void testExplicitQuery() throws RpcException {
+    String sql = "SELECT path, data_type, file_name FROM cp.`hdf5/dset.h5`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("path", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+      .add("data_type", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.OPTIONAL)
+      .add("file_name", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("/dset", "DATASET", "dset.h5")
+      .build();
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testStarQuery() throws Exception {
+    List<Integer> t1 = Arrays.asList(1, 2, 3, 4, 5, 6);
+    List<Integer> t2 = Arrays.asList(7, 8, 9, 10, 11, 12);
+    List<Integer> t3 = Arrays.asList(13, 14, 15, 16, 17, 18);
+    List<Integer> t4 = Arrays.asList(19, 20, 21, 22, 23, 24);
+    List<List<Integer>> finalList = new ArrayList<>();
+    finalList.add(t1);
+    finalList.add(t2);
+    finalList.add(t3);
+    finalList.add(t4);
+
+    testBuilder()
+      .sqlQuery("SELECT * FROM cp.`hdf5/dset.h5`")
+      .unOrdered()
+      .baselineColumns("path", "data_type", "file_name", "int_data")
+      .baselineValues("/dset", "DATASET", "dset.h5", finalList)
+      .go();
+  }
+
+  @Test
+  public void testSimpleExplicitQuery() throws Exception {
+    List<Integer> t1 = Arrays.asList(1, 2, 3, 4, 5, 6);
+    List<Integer> t2 = Arrays.asList(7, 8, 9, 10, 11, 12);
+    List<Integer> t3 = Arrays.asList(13, 14, 15, 16, 17, 18);
+    List<Integer> t4 = Arrays.asList(19, 20, 21, 22, 23, 24);
+    List<List<Integer>> finalList = new ArrayList<>();
+    finalList.add(t1);
+    finalList.add(t2);
+    finalList.add(t3);
+    finalList.add(t4);
+
+    testBuilder()
+      .sqlQuery("SELECT path, data_type, file_name, int_data FROM 
cp.`hdf5/dset.h5`")
+      .ordered()
+      .baselineColumns("path", "data_type", "file_name", "int_data")
+      .baselineValues("/dset", "DATASET", "dset.h5", finalList)
+      .go();
+  }
+
+  @Test
+  public void testFlattenColumnQuery() throws RpcException {
+    String sql = "SELECT data[0] AS col1,\n" +
+            "data[1] as col2,\n" +
+            "data[2] as col3\n" +
+            "FROM \n" +
+            "(\n" +
+            "SELECT FLATTEN(double_data) AS data \n" +
+            "FROM cp.`hdf5/browsing.h5` WHERE path='/groupB/dmat'\n" +
+            ")";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("col1", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
+      .add("col2", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
+      .add("col3", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1.1, 2.2, 3.3)
+      .addRow(4.4, 5.5, 6.6)
+      .addRow(7.7, 8.8, 9.9)
+      .build();
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFilterWithNonProjectedFieldQuery() throws Exception {
+    String sql = "SELECT `path` FROM cp.`hdf5/browsing.h5` WHERE data_type='DATASET'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("path", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("/groupA/date")
+      .addRow("/groupA/string")
+      .addRow("/groupB/dmat")
+      .addRow("/groupB/inarr")
+      .build();
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFloat32ScalarQuery() throws Exception {
+    String sql = "SELECT flatten(float32) AS float_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/float32'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("float_col", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-3.4028234663852886E38)
+      .addRow(1.0)
+      .addRow(2.0)
+      .addRow(3.0)
+      .addRow(4.0)
+      .addRow(5.0)
+      .addRow(6.0)
+      .addRow(7.0)
+      .addRow(8.0)
+      .addRow(3.4028234663852886E38)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFlattenFloat32ScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/float32'))";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("float32", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-3.4028234663852886E38)
+      .addRow(1.0)
+      .addRow(2.0)
+      .addRow(3.0)
+      .addRow(4.0)
+      .addRow(5.0)
+      .addRow(6.0)
+      .addRow(7.0)
+      .addRow(8.0)
+      .addRow(3.4028234663852886E38)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFloat64ScalarQuery() throws Exception {
+    String sql = "SELECT flatten(float64) AS float_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/float64'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("float_col", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-1.7976931348623157E308)
+      .addRow(1.0)
+      .addRow(2.0)
+      .addRow(3.0)
+      .addRow(4.0)
+      .addRow(5.0)
+      .addRow(6.0)
+      .addRow(7.0)
+      .addRow(8.0)
+      .addRow(1.7976931348623157E308)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFlattenFloat64ScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/float64'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("float64", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-1.7976931348623157E308)
+      .addRow(1.0)
+      .addRow(2.0)
+      .addRow(3.0)
+      .addRow(4.0)
+      .addRow(5.0)
+      .addRow(6.0)
+      .addRow(7.0)
+      .addRow(8.0)
+      .addRow(1.7976931348623157E308)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testInt32ScalarQuery() throws Exception {
+    String sql = "SELECT flatten(int32) AS int_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/int32'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col", TypeProtos.MinorType.INT, TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648)
+      .addRow(1)
+      .addRow(2)
+      .addRow(3)
+      .addRow(4)
+      .addRow(5)
+      .addRow(6)
+      .addRow(7)
+      .addRow(8)
+      .addRow(2147483647)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFlattenInt32ScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/int32'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int32", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648)
+      .addRow(1)
+      .addRow(2)
+      .addRow(3)
+      .addRow(4)
+      .addRow(5)
+      .addRow(6)
+      .addRow(7)
+      .addRow(8)
+      .addRow(2147483647)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testInt64ScalarQuery() throws Exception {
+    String sql = "SELECT flatten(int64) AS long_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/int64'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("long_col", TypeProtos.MinorType.BIGINT, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-9223372036854775808L)
+      .addRow(1L)
+      .addRow(2L)
+      .addRow(3L)
+      .addRow(4L)
+      .addRow(5L)
+      .addRow(6L)
+      .addRow(7L)
+      .addRow(8L)
+      .addRow(9223372036854775807L)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+
+  }
+
+  @Test
+  public void testFlattenInt64ScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/int64'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int64", TypeProtos.MinorType.BIGINT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-9223372036854775808L)
+      .addRow(1L)
+      .addRow(2L)
+      .addRow(3L)
+      .addRow(4L)
+      .addRow(5L)
+      .addRow(6L)
+      .addRow(7L)
+      .addRow(8L)
+      .addRow(9223372036854775807L)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testStringScalarQuery() throws Exception {
+    String sql = "SELECT flatten(s10) AS string_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/s10'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("string_col", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("a         ")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("abcdefghij")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFlattenStringScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/s10'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("s10", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("a         ")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("abcdefghij")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+
+  @Test
+  public void testUnicodeScalarQuery() throws Exception {
+    String sql = "SELECT flatten(unicode) AS string_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/datatype/unicode'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("string_col", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("a")
+      .addRow("Ελληνικά")
+      .addRow("日本語")
+      .addRow("العربية")
+      .addRow("экземпляр")
+      .addRow("סקרן")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("abcdefghij")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testUnicodeFlattenScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/datatype/unicode'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("unicode", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("a")
+      .addRow("Ελληνικά")
+      .addRow("日本語")
+      .addRow("العربية")
+      .addRow("экземпляр")
+      .addRow("סקרן")
+      .addRow("")
+      .addRow("")
+      .addRow("")
+      .addRow("abcdefghij")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+
+  @Test
+  public void test1DScalarQuery() throws Exception {
+    String sql = "SELECT FLATTEN(`1D`) AS int_col\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/nd/1D'\n" +
+            "LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col", TypeProtos.MinorType.INT, TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648)
+      .addRow(1)
+      .addRow(2)
+      .addRow(3)
+      .addRow(4)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void test1DFlattenScalarQuery() throws Exception {
+    String sql = "SELECT * FROM table(cp.`hdf5/scalar.h5` (type => 'hdf5', 
defaultPath => '/nd/1D')) LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("1D", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648)
+      .addRow(1)
+      .addRow(2)
+      .addRow(3)
+      .addRow(4)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+
+  @Test
+  public void test2DFlattenScalarQuery() throws Exception {
+    String sql = "SELECT int_col_0, int_col_1 FROM table(cp.`hdf5/scalar.h5` 
(type => 'hdf5', defaultPath => '/nd/2D'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(10, 11)
+      .addRow(20, 21)
+      .addRow(30, 31)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void test2DScalarQuery() throws Exception {
+    String sql = "SELECT int_data[0] AS col1,\n" +
+      "int_data[1] AS col2\n" +
+      "FROM\n" +
+      "(\n" +
+      "SELECT flatten(int_data) AS int_data\n" +
+      "FROM cp.`hdf5/scalar.h5`\n" +
+      "WHERE path='/nd/2D'\n" +
+      ") AS t1";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("col1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("col2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(10, 11)
+      .addRow(20, 21)
+      .addRow(30, 31)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+
+  @Test
+  public void test3DScalarQuery() throws Exception {
+    String sql = "SELECT int_data[0] AS col1,\n" +
+      "int_data[1] AS col2\n" +
+      "FROM\n" +
+      "(\n" +
+      "SELECT flatten(int_data) AS int_data\n" +
+      "FROM cp.`hdf5/scalar.h5`\n" +
+      "WHERE path='/nd/3D'\n" +
+      ") AS t1";
+
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("col1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("col2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(2, 3)
+      .addRow(4, 5)
+      .addRow(6, 7)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void test3DFlattenScalarQuery() throws Exception {
+    String sql = "SELECT int_col_0, int_col_1 FROM table(cp.`hdf5/scalar.h5` 
(type => 'hdf5', defaultPath => '/nd/3D'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(2, 3)
+      .addRow(4, 5)
+      .addRow(6, 7)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void test4DScalarQuery() throws Exception {
+    String sql = "SELECT int_data[0] AS col1,\n" +
+            "int_data[1] AS col2\n" +
+            "FROM\n" +
+            "(\n" +
+            "SELECT flatten(int_data) AS int_data\n" +
+            "FROM cp.`hdf5/scalar.h5`\n" +
+            "WHERE path='/nd/4D'\n" +
+            ") AS t1";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("col1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("col2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(2, 3)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void test4DFlattenScalarQuery() throws Exception {
+    String sql = "SELECT int_col_0, int_col_1 FROM table(cp.`hdf5/scalar.h5` 
(type => 'hdf5', defaultPath => '/nd/4D')) LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(-2147483648, 1)
+      .addRow(2, 3)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testNonScalarIntQuery() throws Exception {
+    String sql = "SELECT flatten(t1.compound_data.`field 1`) as field_1\n" +
+            "FROM cp.`hdf5/non-scalar.h5` AS t1\n" +
+            "LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("field_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(0)
+      .addRow(1)
+      .addRow(2)
+      .addRow(3)
+      .addRow(4)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testNonScalarFloatQuery() throws Exception {
+    String sql = "SELECT flatten(t1.compound_data.`field 2`) as field_2\n" +
+            "FROM cp.`hdf5/non-scalar.h5` AS t1\n" +
+            "LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("field_2", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(0.0)
+      .addRow(1.0)
+      .addRow(2.0)
+      .addRow(3.0)
+      .addRow(4.0)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testNonScalarStringQuery() throws Exception {
+    String sql = "SELECT flatten(t1.compound_data.`field 3`) as field_3\n" +
+            "FROM cp.`hdf5/non-scalar.h5` AS t1\n" +
+            "LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("field_3", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.REQUIRED)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("0")
+      .addRow("1")
+      .addRow("2")
+      .addRow("3")
+      .addRow("4")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testAttributes() throws Exception {
+    String sql = "SELECT path, file_name\n" +
+            "FROM cp.`hdf5/browsing.h5` AS t1 WHERE t1.attributes.`important` 
= false";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("path", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+      .add("file_name", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("/groupB", "browsing.h5")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testStarProjectDatasetQuery() throws Exception {
+    String sql = "SELECT * \n"+
+      "FROM \n" +
+      "table(cp.`hdf5/dset.h5` (type => 'hdf5', defaultPath => '/dset'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_3", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_4", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_5", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1,2,3,4,5,6)
+      .addRow(7,8,9,10,11,12)
+      .addRow(13,14,15,16,17,18)
+      .addRow(19,20,21,22,23,24)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testExplicitProjectDatasetQuery() throws Exception {
+    String sql = "SELECT int_col_0, int_col_1, int_col_2, int_col_3, 
int_col_4\n"+
+      "FROM \n" +
+      "table(cp.`hdf5/dset.h5` (type => 'hdf5', defaultPath => '/dset'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_3", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("int_col_4", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1,2,3,4,5)
+      .addRow(7,8,9,10,11)
+      .addRow(13,14,15,16,17)
+      .addRow(19,20,21,22,23)
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testCompoundStarQuery() throws Exception {
+
+    String sql = "SELECT * FROM table(cp.`hdf5/non-scalar.h5` (type => 'hdf5', 
defaultPath => '/compound')) LIMIT 5";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("field_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+      .add("field_2", TypeProtos.MinorType.FLOAT8, 
TypeProtos.DataMode.OPTIONAL)
+      .add("field_3", TypeProtos.MinorType.VARCHAR, 
TypeProtos.DataMode.OPTIONAL)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(0, 0.0, "0")
+      .addRow(1, 1.0, "1")
+      .addRow(2, 2.0, "2")
+      .addRow(3, 3.0, "3")
+      .addRow(4, 4.0, "4")
+      .build();
+
+    new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
+  }
+
+  @Test
+  public void testCompoundExplicitQuery() throws Exception {
 
 Review comment:
   You can check `TestLogReader#testTableFunctionWithSchema`, though there is no need to enable the `store.table.use_schema_file` option, since a schema provided via a table function does not depend on this option.
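
   For illustration, here is a hedged sketch of the table-function-with-schema pattern referenced above, written in the style of this test class. The `schema` parameter value, column name, and file path in the sketch are illustrative assumptions only; they are not taken from the HDF5 plugin, this PR, or `TestLogReader`.

   @Test
   public void testTableFunctionWithProvidedSchema() throws Exception {
     // Hypothetical sketch: a schema supplied through the table function is
     // applied on its own, without enabling store.table.use_schema_file.
     String sql = "SELECT * FROM table(cp.`hdf5/dset.h5` (type => 'hdf5', " +
         "defaultPath => '/dset', schema => 'inline=(int_col_0 INT)'))";
     RowSet results = client.queryBuilder().sql(sql).rowSet();
     // Assertions against the provided schema would go here.
     results.clear();
   }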
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Format Plugin for HDF5
> ----------------------
>
>                 Key: DRILL-7233
>                 URL: https://issues.apache.org/jira/browse/DRILL-7233
>             Project: Apache Drill
>          Issue Type: New Feature
>    Affects Versions: 1.17.0
>            Reporter: Charles Givre
>            Assignee: Charles Givre
>            Priority: Major
>              Labels: doc-impacting
>             Fix For: 1.18.0
>
>
> h2. Drill HDF5 Format Plugin
> Per Wikipedia, Hierarchical Data Format (HDF) is a set of file formats 
> designed to store and organize large amounts of data. Originally developed at 
> the National Center for Supercomputing Applications, it is supported by The 
> HDF Group, a non-profit corporation whose mission is to ensure continued 
> development of HDF5 technologies and the continued accessibility of data 
> stored in HDF.
> This plugin enables Apache Drill to query HDF5 files.
> h3. Configuration
> There are three configuration variables in this plugin:
> * type: This should be set to hdf5.
> * extensions: This is a list of the file extensions used to identify HDF5 
> files. Typically HDF5 uses .h5 or .hdf5 as file extensions. This defaults to .h5.
> * defaultPath: The default path to a data set within the HDF5 file. If defaultPath is defined here or in the query, Drill returns only that data rather than the file metadata.
> h3. Example Configuration
> For most uses, the configuration below will suffice to enable Drill to query 
> HDF5 files.
> {{"hdf5": {
>       "type": "hdf5",
>       "extensions": [
>         "h5"
>       ],
>       "defaultPath": null
>     }}}
> h3. Usage
> Since HDF5 can be viewed as a file system within a file, a single file can 
> contain many datasets. For instance, if you have a simple HDF5 file, a star 
> query will produce the following result:
> {{apache drill> select * from dfs.test.`dset.h5`;
> +-------+-----------+-----------+--------------------------------------------------------------------------+
> | path  | data_type | file_name |                                 int_data                                 |
> +-------+-----------+-----------+--------------------------------------------------------------------------+
> | /dset | DATASET   | dset.h5   | [[1,2,3,4,5,6],[7,8,9,10,11,12],[13,14,15,16,17,18],[19,20,21,22,23,24]] |
> +-------+-----------+-----------+--------------------------------------------------------------------------+}}
> The actual data in this file is mapped to a column called int_data. In order 
> to effectively access the data, you should use Drill's FLATTEN() function on 
> the int_data column, which produces the following result.
> {{apache drill> select flatten(int_data) as int_data from dfs.test.`dset.h5`;
> +---------------------+
> |      int_data       |
> +---------------------+
> | [1,2,3,4,5,6]       |
> | [7,8,9,10,11,12]    |
> | [13,14,15,16,17,18] |
> | [19,20,21,22,23,24] |
> +---------------------+}}
> Once you have the data in this form, you can access it similarly to how you 
> might access nested data in JSON or other files.
> {{apache drill> SELECT int_data[0] as col_0,
> . .semicolon> int_data[1] as col_1,
> . .semicolon> int_data[2] as col_2
> . .semicolon> FROM ( SELECT flatten(int_data) AS int_data
> . . . . . .)> FROM dfs.test.`dset.h5`
> . . . . . .)> );
> +-------+-------+-------+
> | col_0 | col_1 | col_2 |
> +-------+-------+-------+
> | 1     | 2     | 3     |
> | 7     | 8     | 9     |
> | 13    | 14    | 15    |
> | 19    | 20    | 21    |
> +-------+-------+-------+}}
> Alternatively, a better way to query the actual data in an HDF5 file is to 
> use the defaultPath field in your query. If the defaultPath field is defined 
> in the query, or via the plugin configuration, Drill will return only the 
> data, rather than the file metadata.
> ** Note: Once you have determined which data set you are querying, it is 
> advisable to use this method to query HDF5 data. **
> You can set the defaultPath variable in either the plugin configuration, or 
> at query time using the table() function as shown in the example below:
> {{SELECT * 
> FROM table(dfs.test.`dset.h5` (type => 'hdf5', defaultPath => '/dset'))}}
> This query will return the result below:
> {{apache drill> SELECT * FROM table(dfs.test.`dset.h5` (type => 'hdf5', 
> defaultPath => '/dset'));
> +-----------+-----------+-----------+-----------+-----------+-----------+
> | int_col_0 | int_col_1 | int_col_2 | int_col_3 | int_col_4 | int_col_5 |
> +-----------+-----------+-----------+-----------+-----------+-----------+
> | 1         | 2         | 3         | 4         | 5         | 6         |
> | 7         | 8         | 9         | 10        | 11        | 12        |
> | 13        | 14        | 15        | 16        | 17        | 18        |
> | 19        | 20        | 21        | 22        | 23        | 24        |
> +-----------+-----------+-----------+-----------+-----------+-----------+
> 4 rows selected (0.223 seconds)}}
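> The same default can equally be set in the plugin configuration instead of per query; as a sketch based on the configuration format shown above:
> {{"hdf5": {
>       "type": "hdf5",
>       "extensions": [
>         "h5"
>       ],
>       "defaultPath": "/dset"
>     }}}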
> If the data in defaultPath is a single column, the column name will be the last part 
> of the path. If the data is multidimensional, the columns will be named 
> <data_type>_col_n. For example, a column of integers will be called int_col_1.
> h3. Attributes
> Occasionally, HDF5 paths will contain attributes. Drill will map these to a 
> map data structure called attributes, as shown in the query below.
> {{apache drill> SELECT attributes FROM dfs.test.`browsing.h5`;
> +----------------------------------------------------------------------------------+
> |                                    attributes                                     |
> +----------------------------------------------------------------------------------+
> | {}                                                                                |
> | {"__TYPE_VARIANT__":"TIMESTAMP_MILLISECONDS_SINCE_START_OF_THE_EPOCH"}            |
> | {}                                                                                |
> | {}                                                                                |
> | {"important":false,"__TYPE_VARIANT__timestamp__":"TIMESTAMP_MILLISECONDS_SINCE_START_OF_THE_EPOCH","timestamp":1550033296762} |
> | {}                                                                                |
> | {}                                                                                |
> | {}                                                                                |
> +----------------------------------------------------------------------------------+
> 8 rows selected (0.292 seconds)}}
> You can access the individual fields within the attributes map by using the 
> structure table.map.key. Note that you will have to give the table an alias 
> for this to work properly.
> {{apache drill> SELECT path, data_type, file_name  
> FROM dfs.test.`browsing.h5` AS t1 WHERE t1.attributes.important = false;
> +---------+-----------+-------------+
> |  path   | data_type |  file_name  |
> +---------+-----------+-------------+
> | /groupB | GROUP     | browsing.h5 |
> +---------+-----------+-------------+}}
> h3. Known Limitations
> There are several limitations of the HDF5 format plugin in Drill:
> * Drill cannot read unsigned 64-bit integers. When the plugin encounters this 
> data type, it will write an INFO message to the log.
> * Drill cannot read compressed fields in HDF5 files.
> * HDF5 files can contain nested datasets of up to n dimensions. Since Drill 
> works best with two-dimensional data, datasets with more than two dimensions 
> are flattened.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)
