Copilot commented on code in PR #2016:
URL: https://github.com/apache/apisix-website/pull/2016#discussion_r3059394849


##########
scripts/update-sitemap-loc.js:
##########
@@ -15,6 +15,53 @@ const sitemapXMLs = [
   ],
 ];
 
+/**
+ * URL patterns to exclude from the sitemap.
+ *
+ * Why:
+ * - Versioned doc URLs (e.g. /docs/apisix/3.14/) duplicate the latest
+ *   unversioned paths (e.g. /docs/apisix/) and bloat the sitemap.
+ *   Only the unversioned (latest) URLs should be indexed.
+ * - /docs/.../next/ pages are for unreleased development docs.
+ * - /search pages are blocked by robots.txt — keeping them in
+ *   the sitemap sends contradictory signals to crawlers.
+ * - /blog/tags/ and /blog/page/ are low-value aggregation/pagination
+ *   pages, also blocked by robots.txt.
+ */
+const excludePatterns = [
+  // Versioned docs: /docs/<project>/<version>/ where version is digits.digits
+  /\/docs\/[\w-]+\/\d+\.\d+\//,
+  // Development "next" docs
+  /\/docs\/[\w-]+\/next\//,
+  // Search pages (blocked by robots.txt)
+  /\/search\/?$/,
+  // Blog tag and pagination pages (blocked by robots.txt)
+  /\/blog\/tags\//,
+  /\/blog\/page\//,
+];

Review Comment:
   The versioned-docs and `next` patterns only match when there’s a trailing 
`/` after the version/`next` segment. If the sitemap ever contains URLs like 
`/docs/apisix/3.14` or `/docs/apisix/next` (no trailing slash), they will not 
be filtered. Update the patterns to also match end-of-string after the segment 
(e.g., end each pattern with `(?:\/|$)` instead of `\/`, as in 
`/\/docs\/[\w-]+\/next(?:\/|$)/`), so both forms are excluded.



##########
scripts/update-sitemap-loc.js:
##########
@@ -15,6 +15,53 @@ const sitemapXMLs = [
   ],
 ];
 
+/**
+ * URL patterns to exclude from the sitemap.
+ *
+ * Why:
+ * - Versioned doc URLs (e.g. /docs/apisix/3.14/) duplicate the latest
+ *   unversioned paths (e.g. /docs/apisix/) and bloat the sitemap.
+ *   Only the unversioned (latest) URLs should be indexed.
+ * - /docs/.../next/ pages are for unreleased development docs.
+ * - /search pages are blocked by robots.txt — keeping them in
+ *   the sitemap sends contradictory signals to crawlers.
+ * - /blog/tags/ and /blog/page/ are low-value aggregation/pagination
+ *   pages, also blocked by robots.txt.
+ */
+const excludePatterns = [
+  // Versioned docs: /docs/<project>/<version>/ where version is digits.digits
+  /\/docs\/[\w-]+\/\d+\.\d+\//,
+  // Development "next" docs
+  /\/docs\/[\w-]+\/next\//,
+  // Search pages (blocked by robots.txt)
+  /\/search\/?$/,
+  // Blog tag and pagination pages (blocked by robots.txt)
+  /\/blog\/tags\//,
+  /\/blog\/page\//,
+];
+
+/**
+ * Returns true if the URL should be excluded from the sitemap.
+ */
+function shouldExclude(url) {
+  return excludePatterns.some((pattern) => pattern.test(url));
+}
+
+/**
+ * Filter out excluded URLs from a sitemap object and return removal count.
+ */
+function filterSitemapUrls(sitemap) {
+  const urls = Array.isArray(sitemap.urlset.url)
+    ? sitemap.urlset.url
+    : [sitemap.urlset.url];
+  const before = urls.length;
+  sitemap.urlset.url = urls.filter((entry) => {
+    const loc = entry.loc && entry.loc._text;
+    return !loc || !shouldExclude(loc);

Review Comment:
   This currently *keeps* entries that don’t have a valid `loc` (`return !loc 
|| ...`). In a sitemap, entries without `loc` are invalid and should be removed 
to avoid generating a malformed sitemap. Consider changing the predicate to 
require `loc` and then apply the exclude filter (i.e., drop entries without 
`loc`).
   ```suggestion
       return Boolean(loc) && !shouldExclude(loc);
   ```



##########
scripts/update-sitemap-loc.js:
##########
@@ -42,6 +89,8 @@ const tasks = new Listr([
                 ...sitemaps[i].urlset.url,
               ];
             }
+            const removed = filterSitemapUrls(res);
+            console.log(`  Filtered out ${removed} URLs from ${group[0]}`);

Review Comment:
   Using `console.log` inside a Listr task can produce messy/duplicated output 
(especially in CI) and may interfere with Listr’s rendering. Prefer reporting 
via Listr’s own mechanisms (e.g., assigning the message to `task.output`, or 
updating `task.title` if the count should stay visible after the task 
finishes) so logs are consistently formatted and don’t break progress 
rendering.



##########
website/static/robots.txt:
##########
@@ -2,6 +2,119 @@
 
 User-agent: *
 
+# Blog aggregation and pagination pages (low-value for indexing)
+Disallow: /blog/tags/
+Disallow: /zh/blog/tags/
+Disallow: /blog/page/
+Disallow: /zh/blog/page/
+
+# Search pages
+Disallow: /search
+Disallow: /zh/search
+
+# Versioned docs — only the unversioned (latest) paths should be indexed.
+# e.g. /docs/apisix/ is the latest; /docs/apisix/3.14/ is a duplicate.
+Disallow: /docs/apisix/3.10/
+Disallow: /docs/apisix/3.11/
+Disallow: /docs/apisix/3.12/
+Disallow: /docs/apisix/3.13/
+Disallow: /docs/apisix/3.14/
+Disallow: /docs/apisix/3.15/
+Disallow: /docs/apisix/next/
+Disallow: /docs/ingress-controller/3.10/
+Disallow: /docs/ingress-controller/3.11/
+Disallow: /docs/ingress-controller/3.12/
+Disallow: /docs/ingress-controller/3.13/
+Disallow: /docs/ingress-controller/3.14/
+Disallow: /docs/ingress-controller/3.15/
+Disallow: /docs/ingress-controller/next/
+Disallow: /docs/helm-chart/3.10/
+Disallow: /docs/helm-chart/3.11/
+Disallow: /docs/helm-chart/3.12/
+Disallow: /docs/helm-chart/3.13/
+Disallow: /docs/helm-chart/3.14/
+Disallow: /docs/helm-chart/3.15/
+Disallow: /docs/helm-chart/next/
+Disallow: /docs/docker/3.10/
+Disallow: /docs/docker/3.11/
+Disallow: /docs/docker/3.12/
+Disallow: /docs/docker/3.13/
+Disallow: /docs/docker/3.14/
+Disallow: /docs/docker/3.15/
+Disallow: /docs/docker/next/
+Disallow: /docs/java-plugin-runner/3.10/
+Disallow: /docs/java-plugin-runner/3.11/
+Disallow: /docs/java-plugin-runner/3.12/
+Disallow: /docs/java-plugin-runner/3.13/
+Disallow: /docs/java-plugin-runner/3.14/
+Disallow: /docs/java-plugin-runner/3.15/

Review Comment:
   Hard-coding every version for every project will require continual manual 
updates as versions/projects change, and it’s easy for robots.txt to drift from 
the sitemap filter logic. Consider generating these directives from a single 
source of truth (e.g., a build step that writes robots.txt from the same 
version list used in docs), or switching to a wildcard-based approach (`*` is 
defined in RFC 9309, but confirm your target crawlers honor it) so that adding 
a new version doesn’t require editing dozens of lines. Note that the 
suggestion below only covers `3.x` versions; a new major version would still 
need its own line.
   ```suggestion
   Disallow: /docs/apisix/3.*/
   Disallow: /docs/apisix/next/
   Disallow: /docs/ingress-controller/3.*/
   Disallow: /docs/ingress-controller/next/
   Disallow: /docs/helm-chart/3.*/
   Disallow: /docs/helm-chart/next/
   Disallow: /docs/docker/3.*/
   Disallow: /docs/docker/next/
   Disallow: /docs/java-plugin-runner/3.*/
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to