Ok I think I got a working version now:

        t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")

        test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})



        result = self.compiler.compile(t)



        def table_provider(names):

            if not names:

                raise Exception("No names provided")

            elif names[0] == 'table0':

                return test_table_0

            else:

                raise Exception(f"Unknown table name {names}")



        reader = pa.substrait.run_query(
            pa.py_buffer(result.SerializeToString()), table_provider)

        result_table = reader.read_all()



        self.assertTrue(result_table == test_table_0)

First successful run with ibis/substrait/acero - Hooray

On Wed, Oct 5, 2022 at 2:33 PM Li Jin <ice.xell...@gmail.com> wrote:

> Hmm. Thanks for the update - now that I have searched the code more, it
> seems I should perhaps be using "compile" rather than "translate":
>
>
> https://github.com/ibis-project/ibis-substrait/blob/main/ibis_substrait/compiler/core.py#L82
>
> Let me try some more
>
> On Wed, Oct 5, 2022 at 1:42 PM Will Jones <will.jones...@gmail.com> wrote:
>
>> Hi Li Jin,
>>
>> The original segfault seems to occur because you are passing a Python bytes
>> object and not a PyArrow Buffer object. You can wrap the bytes object using
>> pa.py_buffer():
>>
>> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
>>
>>
>> That being said, when I run your full example with that, we now get a
>> different error similar to what you get when you pass in through JSON:
>>
>> Traceback (most recent call last):
>>   File "<stdin>", line 1, in <module>
>>   File "pyarrow/_substrait.pyx", line 140, in pyarrow._substrait.run_query
>>     c_reader = GetResultValue(c_res_reader)
>>   File "pyarrow/error.pxi", line 144, in
>> pyarrow.lib.pyarrow_internal_check_status
>>     return check_status(status)
>>   File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
>>     raise ArrowInvalid(message)
>> pyarrow.lib.ArrowInvalid: ExecPlan has no node
>>
>> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:82
>>  plan_->Validate()
>>
>> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:131
>>  executor.Execute()
>>
>>
>> We get the same error even if I add operations onto the plan:
>>
>> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
>> print(result)
>>
>>
>> project {
>>   input {
>>     read {
>>       base_schema {
>>         names: "a"
>>         names: "b"
>>         struct {
>>           types {
>>             i64 {
>>               nullability: NULLABILITY_NULLABLE
>>             }
>>           }
>>           types {
>>             i64 {
>>               nullability: NULLABILITY_NULLABLE
>>             }
>>           }
>>           nullability: NULLABILITY_REQUIRED
>>         }
>>       }
>>       named_table {
>>         names: "table0"
>>       }
>>     }
>>   }
>>   expressions {
>>     selection {
>>       direct_reference {
>>         struct_field {
>>         }
>>       }
>>       root_reference {
>>       }
>>     }
>>   }
>>   expressions {
>>     selection {
>>       direct_reference {
>>         struct_field {
>>           field: 1
>>         }
>>       }
>>       root_reference {
>>       }
>>     }
>>   }
>>   expressions {
>>     window_function {
>>       function_reference: 1
>>       partitions {
>>         selection {
>>           direct_reference {
>>             struct_field {
>>             }
>>           }
>>           root_reference {
>>           }
>>         }
>>       }
>>       upper_bound {
>>         unbounded {
>>         }
>>       }
>>       lower_bound {
>>         unbounded {
>>         }
>>       }
>>       phase: AGGREGATION_PHASE_INITIAL_TO_RESULT
>>       output_type {
>>         i64 {
>>           nullability: NULLABILITY_NULLABLE
>>         }
>>       }
>>       arguments {
>>         value {
>>           selection {
>>             direct_reference {
>>               struct_field {
>>                 field: 1
>>               }
>>             }
>>             root_reference {
>>             }
>>           }
>>         }
>>       }
>>     }
>>   }
>> }
>>
>>
>> Full reproduction:
>>
>> import pyarrow as pa
>> import pyarrow.substrait
>> import ibis
>> from ibis_substrait.compiler.core import SubstraitCompiler
>> from ibis_substrait.compiler.translate import translate
>>
>>
>> compiler = SubstraitCompiler()
>>
>>
>> t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")
>> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
>>
>> def table_provider(names):
>>     if not names:
>>         raise Exception("No names provided")
>>     elif names[0] == 'table0':
>>         return test_table_0
>>     else:
>>         raise Exception(f"Unknown table name {names}")
>>
>>
>> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>
>> result_bytes = result.SerializeToString()
>>
>> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
>>
>> Best,
>>
>> Will Jones
>>
>> On Tue, Oct 4, 2022 at 12:30 PM Li Jin <ice.xell...@gmail.com> wrote:
>>
>> > For reference, this is the "relations" entry that I was referring to:
>> >
>> >
>> https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_substrait.py#L186
>> >
>> > On Tue, Oct 4, 2022 at 3:28 PM Li Jin <ice.xell...@gmail.com> wrote:
>> >
>> > > So I made some progress with updated code:
>> > >
>> > >         t = ibis.table([("a", "int64"), ("b", "int64")],
>> name="table0")
>> > >
>> > >         test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4,
>> 5,
>> > > 6]})
>> > >
>> > >
>> > >
>> > >         result = translate(t, self.compiler)
>> > >
>> > >
>> > >
>> > >         def table_provider(names):
>> > >
>> > >             if not names:
>> > >
>> > >                 raise Exception("No names provided")
>> > >
>> > >             elif names[0] == 'table0':
>> > >
>> > >                 return test_table_0
>> > >
>> > >             else:
>> > >
>> > >                 raise Exception(f"Unknown table name {names}")
>> > >
>> > >
>> > >
>> > >         print(result)
>> > >
>> > >         result_buf =
>> > > pa._substrait._parse_json_plan(tobytes(MessageToJson(result)))
>> > >
>> > >
>> > >
>> > >         pa.substrait.run_query(result_buf, table_provider)
>> > >
>> > > I think the plan is now passed properly, and I got an "ArrowInvalid:
>> > > Empty substrait plan is passed" error.
>> > >
>> > >
>> > > Looking at the plan produced by ibis-substrait, it looks like it doesn't
>> > > match the expected format of the Acero consumer. In particular, it looks
>> > > like the plan produced by ibis-substrait doesn't have a "relations"
>> > > entry - any thoughts on how this can be fixed? (I don't know if I am
>> > > using the API wrong or if there is some format inconsistency between
>> > > the two.)
>> > >
>> > > On Tue, Oct 4, 2022 at 1:54 PM Li Jin <ice.xell...@gmail.com> wrote:
>> > >
>> > >> Hi,
>> > >>
>> > >> I am testing integration between ibis-substrait and Acero but hit a
>> > >> segmentation fault. I think this might be because the way I am
>> > >> integrating these two libraries is wrong. Here is my code:
>> > >>
>> > >> Li Jin
>> > >> 1:51 PM (1 minute ago)
>> > >> to me
>> > >>
>> > >> class BasicTests(unittest.TestCase):
>> > >>
>> > >>     """Test basic features"""
>> > >>
>> > >>
>> > >>
>> > >>
>> > >>
>> > >>     @classmethod
>> > >>
>> > >>     def setUpClass(cls):
>> > >>
>> > >>         cls.compiler = SubstraitCompiler()
>> > >>
>> > >>
>> > >>
>> > >>     def test_named_table(self):
>> > >>
>> > >>         """Test basic"""
>> > >>
>> > >>         t = ibis.table([("a", "int64"), ("b", "int64")],
>> name="table0")
>> > >>
>> > >>         result = translate(t, self.compiler)
>> > >>
>> > >>
>> > >>
>> > >>         def table_provider(names):
>> > >>
>> > >>             if not names:
>> > >>
>> > >>                 raise Exception("No names provided")
>> > >>
>> > >>             elif names[0] == 'table0':
>> > >>
>> > >>                 return test_table_0
>> > >>
>> > >>             else:
>> > >>
>> > >>                 raise Exception(f"Unknown table name {names}")
>> > >>
>> > >>
>> > >>
>> > >>         test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b":
>> [4, 5,
>> > >> 6]})
>> > >>
>> > >>
>> > >>
>> > >>         print(type(result))
>> > >>
>> > >>         print(result)
>> > >>
>> > >>         result_bytes = result.SerializeToString()
>> > >>
>> > >>
>> > >>
>> > >>         pa.substrait.run_query(result_bytes, table_provider)
>> > >>
>> > >>
>> > >> I wonder if someone has tried integration between these two before and
>> > >> can share some working code?
>> > >>
>> > >
>> >
>>
>

Reply via email to