This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new d4b03ec53db [SPARK-44264][PYTHON][DOCS] Added Example to Deepspeed Distributor d4b03ec53db is described below commit d4b03ec53db98d237e00aa9e097ef69faa19b4b1 Author: Mathew Jacob <mathew.ja...@databricks.com> AuthorDate: Wed Jul 26 16:22:45 2023 +0900 [SPARK-44264][PYTHON][DOCS] Added Example to Deepspeed Distributor ### What changes were proposed in this pull request? Added examples to the docstring of using DeepspeedTorchDistributor ### Why are the changes needed? More concrete examples, allowing for a better understanding of the feature. ### Does this PR introduce _any_ user-facing change? Yes, docs changes. ### How was this patch tested? make html Closes #42087 from mathewjacob1002/docs_deepspeed. Lead-authored-by: Mathew Jacob <mathew.ja...@databricks.com> Co-authored-by: Mathew Jacob <134338709+mathewjacob1...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit ac8fe83af2178d76a9e3df9fedf008ef26d8d044) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../pyspark/ml/deepspeed/deepspeed_distributor.py | 50 ++++++++++++++++------ 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py b/python/pyspark/ml/deepspeed/deepspeed_distributor.py index d6ae98de5e3..7c2b8c43526 100644 --- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py +++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py @@ -35,11 +35,11 @@ class DeepspeedTorchDistributor(TorchDistributor): def __init__( self, - num_gpus: int = 1, + numGpus: int = 1, nnodes: int = 1, - local_mode: bool = True, - use_gpu: bool = True, - deepspeed_config: Optional[Union[str, Dict[str, Any]]] = None, + localMode: bool = True, + useGpu: bool = True, + deepspeedConfig: Optional[Union[str, Dict[str, Any]]] = None, ): """ This class is used to run deepspeed training workloads with spark clusters. 
@@ -49,25 +49,51 @@ class DeepspeedTorchDistributor(TorchDistributor): Parameters ---------- - num_gpus: int + numGpus: int The number of GPUs to use per node (analagous to num_gpus in deepspeed command). nnodes: int The number of nodes that should be used for the run. - local_mode: bool + localMode: bool Whether or not to run the training in a distributed fashion or just locally. - use_gpu: bool + useGpu: bool Boolean flag to determine whether to utilize gpus. - deepspeed_config: Union[Dict[str,Any], str] or None: + deepspeedConfig: Union[Dict[str,Any], str] or None: The configuration file to be used for launching the deepspeed application. If it's a dictionary containing the parameters, then we will create the file. If None, deepspeed will fall back to default parameters. + + Examples + -------- + Run Deepspeed training function on a single node + + >>> def train(learning_rate): + ... import deepspeed + ... # rest of training function + ... return model + >>> distributor = DeepspeedTorchDistributor( + ... numGpus=4, + ... nnodes=1, + ... useGpu=True, + ... localMode=True, + ... deepspeedConfig="path/to/config.json") + >>> output = distributor.run(train, 0.01) + + Run Deepspeed training function on multiple nodes + + >>> distributor = DeepspeedTorchDistributor( + ... numGpus=4, + ... nnodes=3, + ... useGpu=True, + ... localMode=False, + ... deepspeedConfig="path/to/config.json") + >>> output = distributor.run(train, 0.01) """ - num_processes = num_gpus * nnodes - self.deepspeed_config = deepspeed_config + num_processes = numGpus * nnodes + self.deepspeed_config = deepspeedConfig super().__init__( num_processes, - local_mode, - use_gpu, + localMode, + useGpu, _ssl_conf=DeepspeedTorchDistributor._DEEPSPEED_SSL_CONF, ) self.cleanup_deepspeed_conf = False --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org