Github user Stibbons commented on a diff in the pull request: https://github.com/apache/spark/pull/13599#discussion_r160549913 --- Diff: python/pyspark/context.py --- @@ -1023,6 +1032,35 @@ def getConf(self): conf.setAll(self._conf.getAll()) return conf + def install_packages(self, packages, install_driver=True): + """ + install python packages on all executors and driver through pip. pip will be installed + by default no matter using native virtualenv or conda. So it is guaranteed that pip is + available if virtualenv is enabled. + :param packages: string for single package or a list of string for multiple packages + :param install_driver: whether to install packages in client + """ + if self._conf.get("spark.pyspark.virtualenv.enabled") != "true": + raise RuntimeError("install_packages can only be called when " + "spark.pyspark.virtualenv.enabled set as true") + if isinstance(packages, basestring): + packages = [packages] + # seems statusTracker.getExecutorInfos() will return driver + executors, so -1 here. + num_executors = len(self._jsc.sc().statusTracker().getExecutorInfos()) - 1 + dummyRDD = self.parallelize(range(num_executors), num_executors) + + def _run_pip(packages, iterator): + import pip + pip.main(["install"] + packages) + + # run it in the main thread. Will do it in a separate thread after + # https://github.com/pypa/pip/issues/2553 is fixed + if install_driver: + _run_pip(packages, None) + + import functools + dummyRDD.foreachPartition(functools.partial(_run_pip, packages)) --- End diff -- What about making this feature experimental and so improving it gradually?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org