This is an automated email from the ASF dual-hosted git repository.
kaxilnaik pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new fab5402def1 Fix connection docs URLs to use Sphinx inventory instead of hardcoded paths (#63349)
fab5402def1 is described below
commit fab5402def16ceba18ac94d8f25704fef725d9f7
Author: Kaxil Naik <[email protected]>
AuthorDate: Wed Mar 11 14:04:32 2026 +0000
Fix connection docs URLs to use Sphinx inventory instead of hardcoded paths (#63349)
The registry was hardcoding `connections/index.html` for all connection
type docs links, which 404s for providers like Tableau (correct URL is
`connections/tableau.html`) and Google BigQuery
(`connections/bigquery.html`).
Now `extract_metadata.py` and `extract_versions.py` parse the provider's
Sphinx `objects.inv` to resolve per-connection-type URLs from `std:label`
and `std:doc` entries. Unresolved conn_types fall back to `connections/`.
Also fixes `connection-builder.js` which was mangling the docs URL by
replacing `index.html` with `{connType}.html` — producing URLs like
`connections/gcpbigquery.html` (404) instead of using the data as-is.
---
dev/registry/extract_metadata.py | 71 +++++++++++--
dev/registry/extract_versions.py | 13 ++-
dev/registry/tests/test_extract_metadata.py | 149 ++++++++++++++++++++++++++++
registry/src/js/connection-builder.js | 9 +-
4 files changed, 224 insertions(+), 18 deletions(-)
diff --git a/dev/registry/extract_metadata.py b/dev/registry/extract_metadata.py
index 1c5047c08e1..25149e39977 100644
--- a/dev/registry/extract_metadata.py
+++ b/dev/registry/extract_metadata.py
@@ -49,9 +49,6 @@ PYPISTATS_RECENT_URL =
"https://pypistats.org/api/packages/{package_name}/recent
PYPI_PACKAGE_JSON_URL = "https://pypi.org/pypi/{package_name}/json"
S3_DOC_URL = "http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com"
AIRFLOW_PROVIDER_DOCS_URL =
"https://airflow.apache.org/docs/{package_name}/stable/"
-AIRFLOW_PROVIDER_CONNECTIONS_URL = (
-
"https://airflow.apache.org/docs/{package_name}/stable/connections/index.html"
-)
AIRFLOW_PROVIDER_SOURCE_URL = (
"https://github.com/apache/airflow/tree/providers-{provider_id}/{version}/providers/{provider_path}"
)
@@ -103,15 +100,18 @@ def fetch_pypi_dates(package_name: str) -> dict[str, str]:
return {"first_released": "", "last_updated": ""}
-def read_inventory(inv_path: Path) -> dict[str, str]:
- """Parse a Sphinx objects.inv file and return {qualified_name: url_path}
for py:class entries."""
+def _parse_inventory_lines(inv_path: Path) -> list[str]:
+ """Read and decompress the body of a Sphinx objects.inv file."""
with inv_path.open("rb") as f:
- # Skip the 4 header lines
for _ in range(4):
f.readline()
- data = zlib.decompress(f.read()).decode("utf-8").splitlines()
+ return zlib.decompress(f.read()).decode("utf-8").splitlines()
+
+
+def read_inventory(inv_path: Path) -> dict[str, str]:
+ """Parse a Sphinx objects.inv file and return {qualified_name: url_path}
for py:class entries."""
result: dict[str, str] = {}
- for line in data:
+ for line in _parse_inventory_lines(inv_path):
parts = line.split(None, 4)
if len(parts) != 5:
continue
@@ -122,6 +122,39 @@ def read_inventory(inv_path: Path) -> dict[str, str]:
return result
+def read_connection_urls(inv_path: Path) -> dict[str, str]:
+ """Parse a Sphinx objects.inv and return {conn_type: relative_url} for
connection pages.
+
+ Uses two inventory entry types:
+ - ``std:label howto/connection:{conn_type}`` — maps conn_type directly to
a page
+ - ``std:doc connections/{name}`` — fallback by matching conn_type to doc
name
+ """
+ label_map: dict[str, str] = {} # conn_type -> page URL (from std:label)
+ doc_map: dict[str, str] = {} # doc_name -> page URL (from std:doc)
+ for line in _parse_inventory_lines(inv_path):
+ parts = line.split(None, 4)
+ if len(parts) != 5:
+ continue
+ name, domain_role, _prio, location, _dispname = parts
+ if domain_role == "std:label" and name.startswith("howto/connection:"):
+ label_key = name[len("howto/connection:") :]
+ # Skip sub-section labels like "gcp:configuring_the_connection"
+ if ":" not in label_key:
+ label_map[label_key] = location.split("#")[0]
+ elif domain_role == "std:doc" and name.startswith("connections/"):
+ doc_name = name[len("connections/") :]
+ if doc_name != "index":
+ doc_map[doc_name] = location
+
+ # Merge: label_map takes precedence, doc_map fills gaps
+ result: dict[str, str] = {}
+ result.update(label_map)
+ for doc_name, url in doc_map.items():
+ if doc_name not in result:
+ result[doc_name] = url
+ return result
+
+
INVENTORY_CACHE_DIR = Path(__file__).parent / ".inventory_cache"
INVENTORY_TTL = datetime.timedelta(hours=12)
@@ -160,6 +193,18 @@ def fetch_provider_inventory(package_name: str, cache_dir:
Path = INVENTORY_CACH
return None
+def resolve_connection_docs_url(conn_type: str, conn_url_map: dict[str, str],
base_docs_url: str) -> str:
+ """Resolve the docs URL for a connection type using the inventory map.
+
+ Lookup order:
+ 1. Exact match on conn_type in the inventory map
+ 2. Fallback to connections/ directory listing
+ """
+ if conn_type in conn_url_map:
+ return f"{base_docs_url}/{conn_url_map[conn_type]}"
+ return f"{base_docs_url}/connections/"
+
+
# Base paths
AIRFLOW_ROOT = Path(__file__).parent.parent.parent
SCRIPT_DIR = Path(__file__).parent
@@ -495,9 +540,13 @@ def main():
shutil.copy2(src, registry_logos_dir / logo_filename)
# Extract connection types from provider.yaml
- # Link to the connections index page since individual connection pages
might not exist
+ # Resolve per-connection docs URLs from Sphinx inventory when available
connection_types = []
- connections_index_url =
AIRFLOW_PROVIDER_CONNECTIONS_URL.format(package_name=package_name)
+ base_docs_url =
AIRFLOW_PROVIDER_DOCS_URL.format(package_name=package_name).rstrip("/")
+ conn_url_map: dict[str, str] = {}
+ inv_path = fetch_provider_inventory(package_name)
+ if inv_path:
+ conn_url_map = read_connection_urls(inv_path)
for conn in provider_yaml.get("connection-types", []):
conn_type = conn.get("connection-type", "")
hook_class = conn.get("hook-class-name", "")
@@ -506,7 +555,7 @@ def main():
{
"conn_type": conn_type,
"hook_class": hook_class,
- "docs_url": connections_index_url,
+ "docs_url": resolve_connection_docs_url(conn_type,
conn_url_map, base_docs_url),
}
)
diff --git a/dev/registry/extract_versions.py b/dev/registry/extract_versions.py
index d52a31b5bf7..38257f19070 100644
--- a/dev/registry/extract_versions.py
+++ b/dev/registry/extract_versions.py
@@ -54,6 +54,7 @@ except ImportError:
print("ERROR: PyYAML required. Install with: pip install pyyaml")
sys.exit(1)
+from extract_metadata import fetch_provider_inventory, read_connection_urls,
resolve_connection_docs_url
from registry_tools.types import MODULE_LEVEL_SECTIONS, TYPE_SUFFIXES
AIRFLOW_ROOT = Path(__file__).parent.parent.parent
@@ -366,13 +367,21 @@ def extract_version_data(
if layout == "old" and not pyproject_data["dependencies"]:
pyproject_data["dependencies"] = provider_yaml.get("dependencies", [])
- # Connection types
+ # Connection types — resolve per-conn_type docs URLs from Sphinx inventory
+ package_name = provider_yaml.get("package-name",
f"apache-airflow-providers-{provider_id}")
+ base_docs_url = f"https://airflow.apache.org/docs/{package_name}/stable"
+ conn_url_map: dict[str, str] = {}
+ inv_path = fetch_provider_inventory(package_name)
+ if inv_path:
+ conn_url_map = read_connection_urls(inv_path)
connection_types = []
for ct in provider_yaml.get("connection-types", []):
+ conn_type = ct.get("connection-type", "")
connection_types.append(
{
- "conn_type": ct.get("connection-type", ""),
+ "conn_type": conn_type,
"hook_class": ct.get("hook-class-name", ""),
+ "docs_url": resolve_connection_docs_url(conn_type,
conn_url_map, base_docs_url),
}
)
diff --git a/dev/registry/tests/test_extract_metadata.py
b/dev/registry/tests/test_extract_metadata.py
index 138b41daae4..5bbccd7cb9b 100644
--- a/dev/registry/tests/test_extract_metadata.py
+++ b/dev/registry/tests/test_extract_metadata.py
@@ -34,7 +34,9 @@ from extract_metadata import (
find_related_providers,
module_path_to_file_path,
parse_pyproject_toml,
+ read_connection_urls,
read_inventory,
+ resolve_connection_docs_url,
)
@@ -419,3 +421,150 @@ class TestFetchProviderInventory:
result = fetch_provider_inventory("apache-airflow-providers-amazon",
cache_dir=cache_dir)
assert result is not None
assert result.read_bytes() == new_content
+
+
+# ---------------------------------------------------------------------------
+# read_connection_urls
+# ---------------------------------------------------------------------------
+class TestReadConnectionUrls:
+ @staticmethod
+ def _make_inventory(tmp_path: Path, entries: list[str]) -> Path:
+ import zlib
+
+ inv_path = tmp_path / "objects.inv"
+ header = (
+ b"# Sphinx inventory version 2\n"
+ b"# Project: test\n"
+ b"# Version: 1.0\n"
+ b"# The remainder of this file is compressed using zlib.\n"
+ )
+ body = "\n".join(entries).encode("utf-8")
+ with inv_path.open("wb") as f:
+ f.write(header)
+ f.write(zlib.compress(body))
+ return inv_path
+
+ def test_parses_std_label_entries(self, tmp_path):
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "howto/connection:kubernetes std:label -1
connections/kubernetes.html#howto-connection-kubernetes Kubernetes cluster
Connection",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result == {"kubernetes": "connections/kubernetes.html"}
+
+ def test_parses_std_doc_entries(self, tmp_path):
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "connections/tableau std:doc -1 connections/tableau.html
Tableau Connection",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result == {"tableau": "connections/tableau.html"}
+
+ def test_label_takes_precedence_over_doc(self, tmp_path):
+ """When both std:label and std:doc exist for the same key, label
wins."""
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "howto/connection:aws std:label -1
connections/aws.html#howto-connection-aws AWS Connection",
+ "connections/aws std:doc -1 connections/aws.html AWS
Connection",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result["aws"] == "connections/aws.html"
+
+ def test_skips_sub_section_labels(self, tmp_path):
+ """Labels like howto/connection:gcp:configuring_the_connection are
sub-sections, not top-level."""
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "howto/connection:gcp std:label -1
connections/gcp.html#howto-connection-gcp GCP Connection",
+ "howto/connection:gcp:configuring_the_connection std:label -1
connections/gcp.html#sub Configuring",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result == {"gcp": "connections/gcp.html"}
+
+ def test_skips_connections_index(self, tmp_path):
+ """The connections/index doc should not appear in the map."""
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "connections/index std:doc -1 connections/index.html
Connection Types",
+ "connections/kafka std:doc -1 connections/kafka.html Kafka
Connection",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert "index" not in result
+ assert result == {"kafka": "connections/kafka.html"}
+
+ def test_ignores_unrelated_entries(self, tmp_path):
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "airflow.providers.amazon.hooks.s3.S3Hook py:class 1
api.html#$ -",
+ "some_module py:module 1 mod.html -",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result == {}
+
+ def test_empty_inventory(self, tmp_path):
+ inv_path = self._make_inventory(tmp_path, [])
+ result = read_connection_urls(inv_path)
+ assert result == {}
+
+ def test_multiple_connection_types(self, tmp_path):
+ """Amazon-style provider with multiple connection pages."""
+ inv_path = self._make_inventory(
+ tmp_path,
+ [
+ "howto/connection:aws std:label -1
connections/aws.html#howto-connection-aws AWS",
+ "howto/connection:emr std:label -1
connections/emr.html#howto-connection-emr EMR",
+ "howto/connection:redshift std:label -1
connections/redshift.html#howto-connection-redshift Redshift",
+ "connections/athena std:doc -1 connections/athena.html Athena",
+ ],
+ )
+ result = read_connection_urls(inv_path)
+ assert result["aws"] == "connections/aws.html"
+ assert result["emr"] == "connections/emr.html"
+ assert result["redshift"] == "connections/redshift.html"
+ assert result["athena"] == "connections/athena.html"
+
+
+# ---------------------------------------------------------------------------
+# resolve_connection_docs_url
+# ---------------------------------------------------------------------------
+class TestResolveConnectionDocsUrl:
+ BASE =
"https://airflow.apache.org/docs/apache-airflow-providers-google/stable"
+
+ def test_exact_match(self):
+ conn_map = {"kubernetes": "connections/kubernetes.html"}
+ url = resolve_connection_docs_url("kubernetes", conn_map, self.BASE)
+ assert url == f"{self.BASE}/connections/kubernetes.html"
+
+ def test_fallback_to_connections_dir(self):
+ conn_map = {"kubernetes": "connections/kubernetes.html"}
+ url = resolve_connection_docs_url("unknown_type", conn_map, self.BASE)
+ assert url == f"{self.BASE}/connections/"
+
+ def test_empty_map_falls_back_to_connections_dir(self):
+ url = resolve_connection_docs_url("aws", {}, self.BASE)
+ assert url == f"{self.BASE}/connections/"
+
+ def test_google_bigquery_resolves(self):
+ """gcpbigquery conn_type should resolve to bigquery.html, not index."""
+ conn_map = {
+ "gcp": "connections/gcp.html",
+ "gcpbigquery": "connections/bigquery.html",
+ }
+ url = resolve_connection_docs_url("gcpbigquery", conn_map, self.BASE)
+ assert url == f"{self.BASE}/connections/bigquery.html"
+
+ def test_tableau_resolves(self):
+ conn_map = {"tableau": "connections/tableau.html"}
+ url = resolve_connection_docs_url("tableau", conn_map, self.BASE)
+ assert url == f"{self.BASE}/connections/tableau.html"
diff --git a/registry/src/js/connection-builder.js
b/registry/src/js/connection-builder.js
index a316a99cb56..3228201f622 100644
--- a/registry/src/js/connection-builder.js
+++ b/registry/src/js/connection-builder.js
@@ -68,12 +68,11 @@
// Set title
titleEl.textContent = connType;
- // Show/hide docs link (derive per-connection-type URL)
+ // Show/hide docs link (URL is resolved per-connection-type by extract
scripts)
if (docsLink) {
- var baseDocsUrl = chip.dataset.docsUrl;
- if (baseDocsUrl) {
- var perTypeUrl = baseDocsUrl.replace(/index\.html$/, connType +
".html");
- docsLink.href = perTypeUrl;
+ var docsUrl = chip.dataset.docsUrl;
+ if (docsUrl) {
+ docsLink.href = docsUrl;
docsLink.hidden = false;
} else {
docsLink.hidden = true;