This is an automated email from the ASF dual-hosted git repository.
zhongjiajie pushed a commit to branch main
in repository
https://gitbox.apache.org/repos/asf/dolphinscheduler-sdk-python.git
The following commit(s) were added to refs/heads/main by this push:
new 714ab73 impv: Add some docs for datax (#135)
714ab73 is described below
commit 714ab7324e6bb67bdcb67b429330360d4fa96497
Author: Jay Chung <[email protected]>
AuthorDate: Sun Jan 14 10:47:09 2024 +0800
impv: Add some docs for datax (#135)
fix: #134
---
.../examples/task_datax_example.py | 7 ++-
src/pydolphinscheduler/tasks/datax.py | 53 ++++++++++++++++++++--
2 files changed, 55 insertions(+), 5 deletions(-)
diff --git a/src/pydolphinscheduler/examples/task_datax_example.py
b/src/pydolphinscheduler/examples/task_datax_example.py
index d463ff0..cff9e21 100644
--- a/src/pydolphinscheduler/examples/task_datax_example.py
+++ b/src/pydolphinscheduler/examples/task_datax_example.py
@@ -25,6 +25,8 @@ You can create data sources `first_mysql` and `second_mysql`
through UI.
It creates a task to synchronize datax from the source database to the target
database.
"""
+import json
+
from pydolphinscheduler.core.workflow import Workflow
from pydolphinscheduler.tasks.datax import CustomDataX, DataX
@@ -89,7 +91,10 @@ with Workflow(
# You can custom json_template of datax to sync data. This task create a
new
# datax job same as task1, transfer record from `first_mysql` to
`second_mysql`
- task2 = CustomDataX(name="task_custom_datax", json=str(JSON_TEMPLATE))
+ # We should format the custom json config if we want to format it in web UI
+ task2 = CustomDataX(
+ name="task_custom_datax", json=json.dumps(JSON_TEMPLATE, indent=4)
+ )
# [start resource_limit]
resource_limit = DataX(
diff --git a/src/pydolphinscheduler/tasks/datax.py
b/src/pydolphinscheduler/tasks/datax.py
index 148b4b2..59edfee 100644
--- a/src/pydolphinscheduler/tasks/datax.py
+++ b/src/pydolphinscheduler/tasks/datax.py
@@ -29,6 +29,38 @@ class CustomDataX(WorkerResourceMixin, Task):
"""Task CustomDatax object, declare behavior for custom DataX task to
dolphinscheduler.
You provide a json template for DataX; it can synchronize data according to
the template you provided.
+
+ :param name: task name for this task
+ :param json: json template string, or json file path for custom DataX
task, :class:`CustomDataX` will not
+ format the json template, you should format it yourself.
+
+ * Use config string directly instead of json file path
+ * should use :func:`json.dumps` to format it if your json template
is dict
+
+ .. code-block:: python
+
+ import json
+
+ custom = CustomDataX(
+ name="custom_datax",
+ json=json.dumps({"job": {"content": [{"reader": {"name":
"mysqlreader"}}]}}),
+ )
+
+ * or format it manually if your json template is a native str.
+ * Use a json file path; the format shown in the web UI depends on
your json file content.
+
+ .. code-block:: python
+
+ import json
+
+ custom = CustomDataX(
+ name="custom_datax",
+ # web UI datax config will show as json file content
+ json="/path/to/datax.json",
+ )
+
+ :param xms: jvm param about min memory for task datax running, default is
1g
+ :param xmx: jvm param about max memory for task datax running, default is
1g
"""
CUSTOM_CONFIG = 1
@@ -68,12 +100,25 @@ class DataX(WorkerResourceMixin, Task):
You provide datasource_name and datatarget_name containing connection
information; they decide which
database type and database instance would synchronize data.
- :param name: task name.
- :param datasource_name: source database name for task datax to extract
data.
- :param datatarget_name: target database name for task datax to load data.
+ :param name: task name for this task
+ :param datasource_name: source database name for task datax to extract
data, it must exist in
+ dolphinscheduler's datasource center otherwise task datax will raise an
exception.
+ :param datatarget_name: target database name for task datax to load data,
it must exist in
+ dolphinscheduler's datasource center otherwise task datax will raise an
exception.
:param sql: sql statement for task datax to extract data form source
database.
:param target_table: target table name for task datax to load data into
target database.
- :param datasource_type: source database type, dolphinscheduler use
+ :param datasource_type: source database type, dolphinscheduler uses it to
find :param:``datasource_name``
+ in datasource center.
+ :param datatarget_type: target database type, dolphinscheduler uses it to
find :param:``datatarget_name``
+ in datasource center.
+ :param job_speed_byte: task datax job speed byte, default is 0. For more
detail see
+ https://github.com/alibaba/DataX
+ :param job_speed_record: task datax job speed record, default is 1000. For
more detail see
+ https://github.com/alibaba/DataX
+ :param pre_statements: task datax job pre statements, it will execute
before task datax job start to load.
+ default is None.
+ :param post_statements: task datax job post statements, it will execute
after task datax job finish load.
+ default is None.
"""
CUSTOM_CONFIG = 0