This is an automated email from the ASF dual-hosted git repository.
zhongjiajie pushed a commit to branch main
in repository
https://gitbox.apache.org/repos/asf/dolphinscheduler-sdk-python.git
The following commit(s) were added to refs/heads/main by this push:
new 714ab73 impv: Add some docs for datax (#135)
714ab73 is described below
commit 714ab7324e6bb67bdcb67b429330360d4fa96497
Author: Jay Chung <[email protected]>
AuthorDate: Sun Jan 14 10:47:09 2024 +0800
impv: Add some docs for datax (#135)
fix: #134
---
.../examples/task_datax_example.py | 7 ++-
src/pydolphinscheduler/tasks/datax.py | 53 ++++++++++++++++++++--
2 files changed, 55 insertions(+), 5 deletions(-)
diff --git a/src/pydolphinscheduler/examples/task_datax_example.py
b/src/pydolphinscheduler/examples/task_datax_example.py
index d463ff0..cff9e21 100644
--- a/src/pydolphinscheduler/examples/task_datax_example.py
+++ b/src/pydolphinscheduler/examples/task_datax_example.py
@@ -25,6 +25,8 @@ You can create data sources `first_mysql` and `second_mysql`
through UI.
It creates a task to synchronize datax from the source database to the target
database.
"""
+import json
+
from pydolphinscheduler.core.workflow import Workflow
from pydolphinscheduler.tasks.datax import CustomDataX, DataX
@@ -89,7 +91,10 @@ with Workflow(
# You can custom json_template of datax to sync data. This task create a
new
# datax job same as task1, transfer record from `first_mysql` to
`second_mysql`
- task2 = CustomDataX(name="task_custom_datax", json=str(JSON_TEMPLATE))
+ # We should format the custom json config if we want to format it in web UI
+ task2 = CustomDataX(
+ name="task_custom_datax", json=json.dumps(JSON_TEMPLATE, indent=4)
+ )
# [start resource_limit]
resource_limit = DataX(
diff --git a/src/pydolphinscheduler/tasks/datax.py
b/src/pydolphinscheduler/tasks/datax.py
index 148b4b2..59edfee 100644
--- a/src/pydolphinscheduler/tasks/datax.py
+++ b/src/pydolphinscheduler/tasks/datax.py
@@ -29,6 +29,38 @@ class CustomDataX(WorkerResourceMixin, Task):
"""Task CustomDatax object, declare behavior for custom DataX task to
dolphinscheduler.
You provide a json template for DataX; it can synchronize data according to
the template you provided.
+
+ :param name: task name for this task
+ :param json: json template string, or json file path for custom DataX
task, :class:`CustomDataX` will not
+ format the json template, you should format it yourself.
+
+ * Use config string directly instead of json file path
+ * should use :func:`json.dumps` to format it if your json template
is dict
+
+ .. code-block:: python
+
+ import json
+
+ custom = CustomDataX(
+ name="custom_datax",
+ json=json.dumps({"job": {"content": [{"reader": {"name":
"mysqlreader"}}]}}),
+ )
+
+ * or format it manually if your json template is a native str.
+ * Use a json file path; the format shown in the web UI depends on
your json file content.
+
+ .. code-block:: python
+
+ import json
+
+ custom = CustomDataX(
+ name="custom_datax",
+ # web UI datax config will show as json file content
+ json="/path/to/datax.json",
+ )
+
+ :param xms: jvm param about min memory for task datax running, default is
1g
+ :param xmx: jvm param about max memory for task datax running, default is
1g
"""
CUSTOM_CONFIG = 1
@@ -68,12 +100,25 @@ class DataX(WorkerResourceMixin, Task):
You provide datasource_name and datatarget_name containing connection
information; they decide which
database type and database instance would synchronize data.
- :param name: task name.
- :param datasource_name: source database name for task datax to extract
data.
- :param datatarget_name: target database name for task datax to load data.
+ :param name: task name for this task
+ :param datasource_name: source database name for task datax to extract
data, it must exist in
+ dolphinscheduler's datasource center otherwise task datax will raise an
exception.
+ :param datatarget_name: target database name for task datax to load data,
it must exist in
+ dolphinscheduler's datasource center otherwise task datax will raise an
exception.
:param sql: sql statement for task datax to extract data form source
database.
:param target_table: target table name for task datax to load data into
target database.
- :param datasource_type: source database type, dolphinscheduler use
+ :param datasource_type: source database type, dolphinscheduler uses it to
find :param:``datasource_name``
+ in datasource center.
+ :param datatarget_type: target database type, dolphinscheduler uses it to
find :param:``datatarget_name``
+ in datasource center.
+ :param job_speed_byte: task datax job speed byte, default is 0. For more
detail see
+ https://github.com/alibaba/DataX
+ :param job_speed_record: task datax job speed record, default is 1000. For
more detail see
+ https://github.com/alibaba/DataX
+ :param pre_statements: task datax job pre statements, it will execute
before task datax job start to load.
+ default is None.
+ :param post_statements: task datax job post statements, it will execute
after task datax job finish load.
+ default is None.
"""
CUSTOM_CONFIG = 0