westonpace commented on code in PR #14024:
URL: https://github.com/apache/arrow/pull/14024#discussion_r969893499
##########
python/pyarrow/includes/libarrow_substrait.pxd:
##########
@@ -22,10 +22,24 @@ from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
+# ctypedef CResult[shared_ptr[CTable]] named_table_provider(const
std_vector[c_string]&)
Review Comment:
```suggestion
```
##########
cpp/src/arrow/compute/exec/exec_plan.cc:
##########
@@ -643,6 +643,10 @@ Declaration Declaration::Sequence(std::vector<Declaration>
decls) {
return out;
}
+bool Declaration::IsValid(ExecFactoryRegistry* registry) const {
+ return registry->GetFactory(this->factory_name).ok() && this->options !=
nullptr;
Review Comment:
```suggestion
return !this->factory_name.empty() && this->options != nullptr;
```
I think the earlier advice I provided was slightly off. I think, all we
need to do here is distinguish between "default-constructed invalid
declarations" and "properly constructed declarations". Actual validation (e.g.
hash-map lookup in the factory) doesn't really belong in this method as it
should be really fast.
##########
python/pyarrow/_substrait.pyx:
##########
@@ -17,22 +17,98 @@
# cython: language_level = 3
from cython.operator cimport dereference as deref
+from libcpp.vector cimport vector as std_vector
from pyarrow import Buffer
-from pyarrow.lib import frombytes
+from pyarrow.lib import frombytes, tobytes
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_substrait cimport *
-def run_query(plan):
+cdef CDeclaration _create_named_table_provider(dict named_args, const
std_vector[c_string]& names):
+ cdef:
+ c_string c_name
+ shared_ptr[CTable] c_in_table
+ shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
+ shared_ptr[CExecNodeOptions] c_input_node_opts
+ vector[CDeclaration.Input] no_c_inputs
+
+ py_names = []
+ for i in range(names.size()):
+ c_name = names[i]
+ py_names.append(frombytes(c_name))
+
+ py_table = named_args["provider"](py_names)
+ c_in_table = pyarrow_unwrap_table(py_table)
+ c_tablesourceopts = make_shared[CTableSourceNodeOptions](c_in_table)
+ c_input_node_opts = static_pointer_cast[CExecNodeOptions,
CTableSourceNodeOptions](
+ c_tablesourceopts)
+ return CDeclaration(tobytes("table_source"),
+ no_c_inputs, c_input_node_opts)
+
+
+def run_query(plan, table_provider=None):
"""
Execute a Substrait plan and read the results as a RecordBatchReader.
Parameters
----------
plan : Buffer
The serialized Substrait plan to execute.
+ table_provider : object (optional)
+ A function to resolve any NamedTable relation to a table.
+ The function will receive a single argument which will be a list
+ of strings representing the table name and should return a
pyarrow.Table.
+
+ Returns
+ -------
+ RecordBatchReader
+ A reader containing the result of the executed query
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> from pyarrow.lib import tobytes
+ >>> import pyarrow.substrait as substrait
+ >>> test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]})
+ >>> test_table_2 = pa.Table.from_pydict({"x": [4, 5, 6]})
+ >>> def table_provider(names):
+ ... if not names:
+ ... raise Exception("No names provided")
+ ... elif names[0] == "t1":
+ ... return test_table_1
+ ... elif names[1] == "t2":
+ ... return test_table_2
+ ... else:
+ ... raise Exception("Unrecognized table name")
+ ...
+ >>> substrait_query = '''
+ ... {
+ ... "relations": [
+ ... {"rel": {
+ ... "read": {
+ ... "base_schema": {
+ ... "struct": {
+ ... "types": [
+ ... {"i64": {}}
+ ... ]
+ ... },
+ ... "names": [
+ ... "x"
+ ... ]
+ ... },
+ ... "namedTable": {
+ ... "names": ["t1"]
+ ... }
+ ... }
+ ... }}
+ ... ]
+ ... }
+ ... '''
+ >>> buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
+ >>> pa.substrait.run_query_with_provider(buf, table_provider)
+ <pyarrow.lib.RecordBatchReader object at 0x100dcaf10>
Review Comment:
Do these examples get verified at build time? If so I don't think this line
`0x100dcaf10` will be very reliable. If not, I suppose it is ok, but it is odd
to see a random pointer value in docs. We could maybe just omit this line.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]