This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new c39a82593c3 [SPARK-44965][PYTHON] Hide internal functions/variables from `pyspark.sql.functions` c39a82593c3 is described below commit c39a82593c3b85e507d6431966bc840ba8c06d60 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Tue Aug 29 09:29:22 2023 +0800 [SPARK-44965][PYTHON] Hide internal functions/variables from `pyspark.sql.functions` ### What changes were proposed in this pull request? Hide internal functions/variables from `pyspark.sql.functions` ### Why are the changes needed? Internal functions/variables should not be exposed to end users: ``` Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version 3.4.1 /_/ Using Python version 3.10.12 (main, Jul 5 2023 15:02:25) Spark context Web UI available at http://localhost:4040/ Spark context available as 'sc' (master = local[*], app id = local-1692949938125). SparkSession available as 'spark'. In [1]: from pyspark.sql.functions import * In [2]: ??to_str Signature: to_str(value: Any) -> Optional[str] Source: def to_str(value: Any) -> Optional[str]: """ A wrapper over str(), but converts bool values to lower case strings. If None is given, just returns None, instead of converting it to string "None". """ if isinstance(value, bool): return str(value).lower() elif value is None: return value else: return str(value) File: ~/.dev/bin/spark-3.4.1-bin-hadoop3/python/pyspark/sql/utils.py Type: function ``` `to_str` here is an internal helper function. ### Does this PR introduce _any_ user-facing change? Yes: `from pyspark.sql.functions import *` no longer imports internal helpers such as `to_str`, `get_active_spark_context` and `try_remote_functions`. ### How was this patch tested? CI ### Was this patch authored or co-authored using generative AI tooling? NO Closes #42680 from zhengruifeng/py_func_all. 
Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/functions.py | 430 +++++++++++++++++++++++++++++ python/pyspark/sql/tests/test_functions.py | 33 +++ 2 files changed, 463 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 5d5557cb916..43b82d31368 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -79,6 +79,436 @@ if has_numpy: # since it requires making every single overridden definition. +__all__ = [ + "abs", + "acos", + "acosh", + "add_months", + "aes_decrypt", + "aes_encrypt", + "aggregate", + "any_value", + "approxCountDistinct", + "approx_count_distinct", + "approx_percentile", + "array", + "array_agg", + "array_append", + "array_compact", + "array_contains", + "array_distinct", + "array_except", + "array_insert", + "array_intersect", + "array_join", + "array_max", + "array_min", + "array_position", + "array_prepend", + "array_remove", + "array_repeat", + "array_size", + "array_sort", + "array_union", + "arrays_overlap", + "arrays_zip", + "asc", + "asc_nulls_first", + "asc_nulls_last", + "ascii", + "asin", + "asinh", + "assert_true", + "atan", + "atan2", + "atanh", + "avg", + "base64", + "bin", + "bit_and", + "bit_count", + "bit_get", + "bit_length", + "bit_or", + "bit_xor", + "bitmap_bit_position", + "bitmap_bucket_number", + "bitmap_construct_agg", + "bitmap_count", + "bitmap_or_agg", + "bitwiseNOT", + "bitwise_not", + "bool_and", + "bool_or", + "broadcast", + "bround", + "btrim", + "bucket", + "call_function", + "call_udf", + "cardinality", + "cast", + "cbrt", + "ceil", + "ceiling", + "char", + "char_length", + "character_length", + "coalesce", + "col", + "collect_list", + "collect_set", + "column", + "concat", + "concat_ws", + "contains", + "conv", + "convert_timezone", + "corr", + "cos", + "cosh", + "cot", + "count", + "countDistinct", + "count_distinct", + "count_if", + "count_min_sketch", + 
"covar_pop", + "covar_samp", + "crc32", + "create_map", + "csc", + "cume_dist", + "curdate", + "current_catalog", + "current_database", + "current_date", + "current_schema", + "current_timestamp", + "current_timezone", + "current_user", + "date_add", + "date_diff", + "date_format", + "date_from_unix_date", + "date_part", + "date_sub", + "date_trunc", + "dateadd", + "datediff", + "datepart", + "day", + "dayofmonth", + "dayofweek", + "dayofyear", + "days", + "decode", + "degrees", + "dense_rank", + "desc", + "desc_nulls_first", + "desc_nulls_last", + "e", + "element_at", + "elt", + "encode", + "endswith", + "equal_null", + "every", + "exists", + "exp", + "explode", + "explode_outer", + "expm1", + "expr", + "extract", + "factorial", + "filter", + "find_in_set", + "first", + "first_value", + "flatten", + "floor", + "forall", + "format_number", + "format_string", + "from_csv", + "from_json", + "from_unixtime", + "from_utc_timestamp", + "get", + "get_json_object", + "getbit", + "greatest", + "grouping", + "grouping_id", + "hash", + "hex", + "histogram_numeric", + "hll_sketch_agg", + "hll_sketch_estimate", + "hll_union", + "hll_union_agg", + "hour", + "hours", + "hypot", + "ifnull", + "ilike", + "initcap", + "inline", + "inline_outer", + "input_file_block_length", + "input_file_block_start", + "input_file_name", + "instr", + "isnan", + "isnotnull", + "isnull", + "java_method", + "json_array_length", + "json_object_keys", + "json_tuple", + "kurtosis", + "lag", + "last", + "last_day", + "last_value", + "lcase", + "lead", + "least", + "left", + "length", + "levenshtein", + "like", + "lit", + "ln", + "localtimestamp", + "locate", + "log", + "log10", + "log1p", + "log2", + "lower", + "lpad", + "ltrim", + "make_date", + "make_dt_interval", + "make_interval", + "make_timestamp", + "make_timestamp_ltz", + "make_timestamp_ntz", + "make_ym_interval", + "map_concat", + "map_contains_key", + "map_entries", + "map_filter", + "map_from_arrays", + "map_from_entries", + "map_keys", + 
"map_values", + "map_zip_with", + "mask", + "max", + "max_by", + "md5", + "mean", + "median", + "min", + "min_by", + "minute", + "mode", + "monotonically_increasing_id", + "month", + "months", + "months_between", + "named_struct", + "nanvl", + "negate", + "negative", + "next_day", + "now", + "nth_value", + "ntile", + "nullif", + "nvl", + "nvl2", + "octet_length", + "overlay", + "overload", + "parse_url", + "percent_rank", + "percentile", + "percentile_approx", + "pi", + "pmod", + "posexplode", + "posexplode_outer", + "position", + "positive", + "pow", + "power", + "printf", + "product", + "quarter", + "radians", + "raise_error", + "rand", + "randn", + "rank", + "reduce", + "reflect", + "regexp", + "regexp_count", + "regexp_extract", + "regexp_extract_all", + "regexp_instr", + "regexp_like", + "regexp_replace", + "regexp_substr", + "regr_avgx", + "regr_avgy", + "regr_count", + "regr_intercept", + "regr_r2", + "regr_slope", + "regr_sxx", + "regr_sxy", + "regr_syy", + "repeat", + "replace", + "reverse", + "right", + "rint", + "rlike", + "round", + "row_number", + "rpad", + "rtrim", + "schema_of_csv", + "schema_of_json", + "sec", + "second", + "sentences", + "sequence", + "session_window", + "sha", + "sha1", + "sha2", + "shiftLeft", + "shiftRight", + "shiftRightUnsigned", + "shiftleft", + "shiftright", + "shiftrightunsigned", + "shuffle", + "sign", + "signum", + "sin", + "sinh", + "size", + "skewness", + "slice", + "some", + "sort_array", + "soundex", + "spark_partition_id", + "split", + "split_part", + "sqrt", + "stack", + "startswith", + "std", + "stddev", + "stddev_pop", + "stddev_samp", + "str_to_map", + "struct", + "substr", + "substring", + "substring_index", + "sum", + "sumDistinct", + "sum_distinct", + "tan", + "tanh", + "timestamp_micros", + "timestamp_millis", + "timestamp_seconds", + "toDegrees", + "toRadians", + "to_binary", + "to_char", + "to_csv", + "to_date", + "to_json", + "to_number", + "to_timestamp", + "to_timestamp_ltz", + "to_timestamp_ntz", + 
"to_unix_timestamp", + "to_utc_timestamp", + "to_varchar", + "transform", + "transform_keys", + "transform_values", + "translate", + "trim", + "trunc", + "try_add", + "try_aes_decrypt", + "try_avg", + "try_divide", + "try_element_at", + "try_multiply", + "try_subtract", + "try_sum", + "try_to_binary", + "try_to_number", + "try_to_timestamp", + "typeof", + "ucase", + "udf", + "udtf", + "unbase64", + "unhex", + "unix_date", + "unix_micros", + "unix_millis", + "unix_seconds", + "unix_timestamp", + "unwrap_udt", + "upper", + "url_decode", + "url_encode", + "user", + "var_pop", + "var_samp", + "variance", + "version", + "weekday", + "weekofyear", + "when", + "width_bucket", + "window", + "window_time", + "xpath", + "xpath_boolean", + "xpath_double", + "xpath_float", + "xpath_int", + "xpath_long", + "xpath_number", + "xpath_short", + "xpath_string", + "xxhash64", + "year", + "years", + "zip_with", + "pandas_udf", + "PandasUDFType", +] + + def _get_jvm_function(name: str, sc: SparkContext) -> Callable: """ Retrieves JVM function identified by name from diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index c484e10ec1a..0633b8c4341 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -92,6 +92,39 @@ class FunctionsTestsMixin: expected_missing_in_py, missing_in_py, "Missing functions in pyspark not as expected" ) + def test_public_function(self): + inspected_list = {name for (name, value) in getmembers(F, isfunction) if name[0] != "_"} + + public_list = set(F.__all__) + + # check alias: both function 'pow' and its alias 'power' should be included + self.assertTrue("pow" in inspected_list) + self.assertTrue("power" in inspected_list) + self.assertTrue("pow" in public_list) + self.assertTrue("power" in public_list) + + inspected_execuded_list = { + "get_active_spark_context", # internal helper function + "try_remote_functions", # internal helper function + "to_str", # 
internal helper function + } + + self.assertEqual( + inspected_list - public_list, + inspected_execuded_list, + "Inspected functions NOT exposed!", + ) + + public_execuded_list = { + "PandasUDFType", # type, not a function + } + + self.assertEqual( + public_list - inspected_list, + public_execuded_list, + "Non-existent functions exposed!", + ) + def test_explode(self): d = [ Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org