Copilot commented on code in PR #39301:
URL: https://github.com/apache/superset/pull/39301#discussion_r3190475316
##########
superset/commands/dataset/importers/v1/utils.py:
##########
@@ -34,9 +37,32 @@
from superset.sql.parse import Table
from superset.utils import json
from superset.utils.core import get_user
+from superset.utils.network import is_safe_host
logger = logging.getLogger(__name__)
+
+class _ValidatingRedirectHandler(HTTPRedirectHandler):
+ """Re-validates the redirect target URL before following any HTTP redirect.
+
+ Prevents bypasses where an initial URL passes validation but a subsequent
+ redirect points to a disallowed destination.
+ """
+
+ def redirect_request(
+ self,
+ req: request.Request,
+ fp: Any,
+ code: int,
+ msg: str,
+ headers: Any,
+ newurl: str,
+ ) -> request.Request | None:
+ """Validate each redirect target before delegating to the parent
handler."""
+ validate_data_uri(newurl)
+ return super().redirect_request(req, fp, code, msg, headers, newurl)
Review Comment:
`newurl` provided by `HTTPRedirectHandler` may be a relative URL (e.g.
`"/download.csv"`). Validating the relative string directly will likely fail
the allowlist and can break legitimate redirects; it also doesn’t validate the
*actual* resolved target. Resolve the redirect against the request URL (e.g.,
via `urljoin(req.full_url, newurl)`) and validate the resulting absolute URL
instead.
##########
superset/db_engine_specs/databricks.py:
##########
@@ -393,7 +393,7 @@ def validate_parameters( # type: ignore
if not host:
return errors
- if not is_hostname_valid(host): # type: ignore
+ if not is_safe_host(host): # type: ignore
errors.append(
SupersetError(
message="The hostname provided can't be resolved.",
Review Comment:
`is_safe_host()` can fail not only when a hostname can’t be resolved, but
also when it resolves to a private/link-local/etc. address. This error message
is now misleading; consider updating it to reflect both cases (e.g., “Host
can’t be resolved or is not allowed”) or differentiating between resolution
failure vs. disallowed address space.
##########
superset/commands/dataset/importers/v1/utils.py:
##########
@@ -82,12 +108,27 @@ def get_dtype(df: pd.DataFrame, dataset: SqlaTable) ->
dict[str, VisitableType]:
def validate_data_uri(data_uri: str) -> None:
"""
- Validate that the data URI is configured on DATASET_IMPORT_ALLOWED_URLS
- has a valid URL.
+ Validate that the data URI is permitted for dataset import.
+
+ Local ``file://`` URIs are allowed only when the path is confined to the
+ bundled examples folder. All other URIs must match a pattern in
+ ``DATASET_IMPORT_ALLOWED_DATA_URLS`` *and* resolve to a publicly-routable
host.
- :param data_uri:
- :return:
+ :param data_uri: the URI to validate
+ :raises DatasetForbiddenDataURI: if the URI is not permitted
"""
+ if data_uri.startswith("file://"):
+ from superset.examples.helpers import get_examples_folder
+
+ # Strip the "file://" prefix to get the filesystem path.
+ file_path = data_uri[len("file://") :]
+ # Resolve symlinks and relative components before comparing.
+ real_path = os.path.realpath(file_path)
+ examples_folder = os.path.realpath(get_examples_folder())
+ if not real_path.startswith(examples_folder + os.sep):
+ raise DatasetForbiddenDataURI()
+ return
Review Comment:
Parsing `file://` URIs by string slicing is brittle and can mis-handle valid
forms like `file://localhost/...` and URL-encoded paths; it also doesn’t
explicitly reject non-local authorities (non-empty netloc). Use `urlparse()` to
validate the scheme/netloc and convert the URL path to a filesystem path (e.g.,
via `urllib.request.url2pathname(parsed.path)`), and explicitly reject any
non-empty netloc except possibly `localhost`.
##########
superset/db_engine_specs/couchbase.py:
##########
@@ -228,7 +228,7 @@ def validate_parameters(
if not host:
return errors
# host can be a connection string in case of couchbase cloud. So
Connection Check is not required in that case. # noqa: E501
- if not is_hostname_valid(host):
+ if not is_safe_host(host):
Review Comment:
The adjacent comment states the Couchbase “host” can be a connection string
for Couchbase Cloud, in which case a raw DNS resolution check on the entire
string will fail (and `is_safe_host()` will reject). If connection strings are
supported here, extract and validate only the hostname portion (or bypass this
check for connection-string forms) so Cloud configs aren’t inadvertently
blocked.
##########
superset/reports/notifications/webhook.py:
##########
@@ -93,6 +94,16 @@ def _get_files(self) -> list[tuple[str, tuple[str, bytes,
str]]]:
)
return files
+ def _validate_webhook_url(self, url: str) -> None:
+ parsed = urlparse(url)
+ if current_app.config["ALERT_REPORTS_WEBHOOK_HTTPS_ONLY"]:
+ if parsed.scheme.lower() != "https":
+ raise NotificationParamException(
+ "Webhook failed: HTTPS is required by config for webhook
URLs."
+ )
+ if parsed.hostname and not is_safe_host(parsed.hostname):
Review Comment:
`_validate_webhook_url` only validates `parsed.hostname` when it exists,
which means URLs without a hostname (e.g. malformed URLs or non-HTTP(S) schemes
like `file://...`) won’t be rejected here and will fail later in less
controlled ways. Consider explicitly enforcing allowed schemes (http/https) and
requiring a hostname/netloc so invalid or unsupported webhook URLs are rejected
with a clear `NotificationParamException`.
##########
superset/db_engine_specs/impala.py:
##########
@@ -209,6 +210,8 @@ def cancel_query(cls, cursor: Any, query: Query,
cancel_query_id: str) -> bool:
"""
try:
impala_host = query.database.url_object.host
+ if not impala_host or not is_safe_host(impala_host):
+ return False
url =
f"http://{impala_host}:25000/cancel_query?query_id={cancel_query_id}"
response = requests.post(url, timeout=3)
Review Comment:
`is_safe_host()` rejects RFC1918/private/link-local addresses, which can
disable `cancel_query()` for common Impala deployments running on internal
networks. If the intent is SSRF-hardening for user-controlled inputs, consider
scoping this to the highest-risk ranges (e.g. loopback/link-local/IMDS) or
gating private-network blocking behind a configuration flag so on-prem/internal
Impala setups can still cancel queries.
##########
superset/utils/network.py:
##########
@@ -14,14 +14,61 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+import ipaddress
import platform
import socket
import subprocess
+# Networks that must never be reached via user-supplied hostnames.
+# Includes loopback, RFC-1918 private ranges, link-local (covers cloud
+# metadata endpoints such as 169.254.169.254), shared address space
+# (RFC 6598, 100.64.0.0/10), and IPv6 equivalents.
+_SSRF_UNSAFE_NETWORKS = (
+ ipaddress.ip_network("0.0.0.0/8"),
+ ipaddress.ip_network("10.0.0.0/8"),
+ ipaddress.ip_network("100.64.0.0/10"),
+ ipaddress.ip_network("127.0.0.0/8"),
+ ipaddress.ip_network("169.254.0.0/16"),
+ ipaddress.ip_network("172.16.0.0/12"),
+ ipaddress.ip_network("192.168.0.0/16"),
+ ipaddress.ip_network("::1/128"),
+ ipaddress.ip_network("fc00::/7"),
+ ipaddress.ip_network("fe80::/10"),
+)
+
PORT_TIMEOUT = 5
PING_TIMEOUT = 5
+def is_safe_host(host: str) -> bool:
+ """
+ Return True if ``host`` resolves exclusively to public, globally-routable
+ IP addresses.
+
+ Returns False if any resolved address falls within a private, loopback,
+ link-local, or otherwise non-routable range. An unresolvable host also
+ returns False.
+ """
+ try:
+ results = socket.getaddrinfo(host, None)
+ except socket.gaierror:
+ return False
+ if not results:
+ return False
+ for _, _, _, _, sockaddr in results:
+ try:
+ ip = ipaddress.ip_address(sockaddr[0])
+ except ValueError:
+ return False
+ # Unwrap IPv4-mapped IPv6 addresses (e.g. ::ffff:127.0.0.1) so they
+ # are checked against the IPv4 unsafe networks rather than bypassing.
+ if isinstance(ip, ipaddress.IPv6Address) and ip.ipv4_mapped:
+ ip = ip.ipv4_mapped
+ if any(ip in net for net in _SSRF_UNSAFE_NETWORKS):
Review Comment:
The docstring says “public, globally-routable” but the implementation only
blocks a subset of non-global ranges. This currently allows other
non-routable/special-use targets (e.g., multicast, reserved, documentation
ranges, unspecified `0.0.0.0` / `::`, etc.) which weakens SSRF protection.
Consider basing the decision on `ip.is_global` (fail closed if not global) and
keeping the explicit blocklist (or expanding it) for clarity.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]