Re: [PR] [CALCITE-2040] Create adapter for Apache Arrow [calcite]

via GitHub Sun, 25 Feb 2024 11:30:41 -0800


asolimando commented on code in PR #3666:
URL: https://github.com/apache/calcite/pull/3666#discussion_r1501882199



##########
arrow/src/test/java/org/apache/calcite/adapter/arrow/ArrowAdapterTest.java:
##########
@@ -0,0 +1,503 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.calcite.adapter.arrow;
+
+import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rel.type.RelDataTypeFactory;
+import org.apache.calcite.rel.type.RelDataTypeSystem;
+import org.apache.calcite.schema.Table;
+import org.apache.calcite.test.CalciteAssert;
+import org.apache.calcite.util.Sources;
+
+import org.apache.arrow.gandiva.evaluator.Projector;
+import org.apache.arrow.gandiva.exceptions.GandivaException;
+import org.apache.arrow.gandiva.expression.ExpressionTree;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+import com.google.common.collect.ImmutableMap;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+/**
+ * Tests for the Apache Arrow adapter.
+ */
+class ArrowAdapterTest {
+  private static Map<String, String> arrow;
+  private static File arrowDataDirectory;
+  private static boolean hasGandivaSupport = detectGandivaSupport();
+
+  ArrowAdapterTest() {
+    assumeTrue(hasGandivaSupport, "gandiva not supported on this platform, 
skipping tests");
+  }
+
+  /**
+   * Gandiva (used to implement arrow filtering / projection) does not 
currently distribute
+   * a binary that is compatible with M1 macs on maven central.
+   * see <a 
href="https://issues.apache.org/jira/browse/ARROW-16608">ARROW-16608</a>.
+   *
+   * @return true if we believe that gandiva is supported on this platform and 
we can run the tests
+   */
+  private static boolean detectGandivaSupport() {
+    try {
+      Schema emptySchema = new Schema(new ArrayList<>(), null);
+      List<ExpressionTree> expressions = new ArrayList<>();
+      Projector.make(emptySchema, expressions);
+    } catch (GandivaException e) {
+      // this is ok -- we'll always hit this because of an empty expression
+      // the fact that we got here, is indicative that the JNI library was 
loaded properly
+      return true;
+    } catch (UnsatisfiedLinkError e) {
+      return false;
+    }
+    return true;
+  }
+
+  @BeforeAll
+  static void initializeArrowState(@TempDir Path sharedTempDir) throws 
IOException, SQLException {
+    URL modelUrl =
+        
Objects.requireNonNull(ArrowAdapterTest.class.getResource("/arrow-model.json"), 
"url");
+    Path sourceModelFilePath = Sources.of(modelUrl).file().toPath();
+    Path modelFileTarget = sharedTempDir.resolve("arrow-model.json");
+    Files.copy(sourceModelFilePath, modelFileTarget);
+
+    Path arrowFilesDirectory = sharedTempDir.resolve("arrow");
+    Files.createDirectory(arrowFilesDirectory);
+    arrowDataDirectory = arrowFilesDirectory.toFile();
+
+    File dataLocationFile = 
arrowFilesDirectory.resolve("arrowdata.arrow").toFile();
+    ArrowData arrowDataGenerator = new ArrowData();
+    arrowDataGenerator.writeArrowData(dataLocationFile);
+    arrowDataGenerator.writeScottEmpData(arrowFilesDirectory);
+
+    arrow = ImmutableMap.of("model", 
modelFileTarget.toAbsolutePath().toString());
+  }
+
+  /** Test to read an Arrow file and check its field names. */
+  @Test void testArrowSchema() {
+    ArrowSchema arrowSchema = new ArrowSchema(arrowDataDirectory);
+    Map<String, Table> tableMap = arrowSchema.getTableMap();
+    RelDataTypeFactory typeFactory =
+        new JavaTypeFactoryImpl(RelDataTypeSystem.DEFAULT);
+    RelDataType relDataType = 
tableMap.get("ARROWDATA").getRowType(typeFactory);
+
+    assertThat(relDataType.getFieldNames().get(0), is("intField"));
+    assertThat(relDataType.getFieldNames().get(1), is("stringField"));
+    assertThat(relDataType.getFieldNames().get(2), is("floatField"));
+  }
+
+  @Test void testArrowProjectAllFields() {
+    String sql = "select * from arrowdata\n";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    String result = "intField=0; stringField=0; floatField=0.0; longField=0\n"
+        + "intField=1; stringField=1; floatField=1.0; longField=1\n"
+        + "intField=2; stringField=2; floatField=2.0; longField=2\n"
+        + "intField=3; stringField=3; floatField=3.0; longField=3\n"
+        + "intField=4; stringField=4; floatField=4.0; longField=4\n"
+        + "intField=5; stringField=5; floatField=5.0; longField=5\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .limit(6)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectSingleField() {
+    String sql = "select \"intField\" from arrowdata\n";
+    String result = "intField=0\nintField=1\nintField=2\n"
+        + "intField=3\nintField=4\nintField=5\n";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowProject(intField=[$0])\n"
+        + "    ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .limit(6)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectTwoFields() {
+    String sql = "select \"intField\", \"stringField\" from arrowdata\n";
+    String result = "intField=0; stringField=0\n"
+        + "intField=1; stringField=1\n"
+        + "intField=2; stringField=2\n"
+        + "intField=3; stringField=3\n"
+        + "intField=4; stringField=4\n"
+        + "intField=5; stringField=5\n";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowProject(intField=[$0], stringField=[$1])\n"
+        + "    ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .limit(6)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectFieldsWithIntegerFilter() {
+    String sql = "select \"intField\", \"stringField\"\n"
+        + "from arrowdata\n"
+        + "where \"intField\" < 4";
+    String result = "intField=0; stringField=0\n"
+        + "intField=1; stringField=1\n"
+        + "intField=2; stringField=2\n"
+        + "intField=3; stringField=3\n";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowProject(intField=[$0], stringField=[$1])\n"
+        + "    ArrowFilter(condition=[<($0, 4)])\n"
+        + "      ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .limit(4)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectFieldsWithMultipleFilters() {
+    String sql = "select \"intField\", \"stringField\"\n"
+        + "from arrowdata\n"
+        + "where \"intField\"=12 and \"stringField\"='12'";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowProject(intField=[$0], stringField=[$1])\n"
+        + "    ArrowFilter(condition=[AND(=($0, 12), =($1, '12'))])\n"
+        + "      ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    String result = "intField=12; stringField=12\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .limit(3)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectFieldsWithFloatFilter() {
+    String sql = "select * from arrowdata\n"
+        + " where \"floatField\"=15.0";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowFilter(condition=[=(CAST($2):DOUBLE, 15.0)])\n"
+        + "    ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    String result = "intField=15; stringField=15; floatField=15.0; 
longField=15\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  @Test void testArrowProjectFieldsWithFilterOnLaterBatch() {
+    String sql = "select \"intField\"\n"
+        + "from arrowdata\n"
+        + "where \"intField\"=25";
+    String plan = "PLAN=ArrowToEnumerableConverter\n"
+        + "  ArrowProject(intField=[$0])\n"
+        + "    ArrowFilter(condition=[=($0, 25)])\n"
+        + "      ArrowTableScan(table=[[ARROW, ARROWDATA]], fields=[[0, 1, 2, 
3]])\n\n";
+    String result = "intField=25\n";
+    CalciteAssert.that()
+        .with(arrow)
+        .query(sql)
+        .returns(result)
+        .explainContains(plan);
+  }
+
+  // TODO: test a table whose field names contain spaces,

Review Comment:
   There seems to be consensus in the [Jira 
ticket](https://issues.apache.org/jira/browse/CALCITE-2040) to merge the 
consolidated version of what we currently have, and to file tickets for the 
current bugs and limitations.
   
   So we can either implement those tests and disable them if they are not 
working (filing an associated ticket for each "family" of failing tests, such 
as "add support for queries over multiple tables"), or directly file a ticket 
and copy what's in the TODO into its description before deleting the comments, 
so we don't lose the context they provide.
   
   Let's see what will be better when we reach a more mature state.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@calcite.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] [CALCITE-2040] Create adapter for Apache Arrow [calcite]

Reply via email to