This is an automated email from the ASF dual-hosted git repository.

janhoy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a2d2e5ce6e SOLR-17342 script to update old ref-guides (#3280)
3a2d2e5ce6e is described below

commit 3a2d2e5ce6e2f735eda889e829d14a83252fddd2
Author: Jan Høydahl <[email protected]>
AuthorDate: Fri Mar 21 09:15:04 2025 +0100

    SOLR-17342 script to update old ref-guides (#3280)
---
 .../scripts/refguide/refguide-download-js-css.py   | 156 +++++++++++++++++++++
 dev-tools/scripts/requirements.txt                 |   4 +-
 2 files changed, 159 insertions(+), 1 deletion(-)

diff --git a/dev-tools/scripts/refguide/refguide-download-js-css.py b/dev-tools/scripts/refguide/refguide-download-js-css.py
new file mode 100755
index 00000000000..ec3e0b1a8a5
--- /dev/null
+++ b/dev-tools/scripts/refguide/refguide-download-js-css.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script processes all static HTML files for Solr's reference guide
+and downloads external JS and CSS files to local folders js/ and css/ for
+each version. It also updates the HTML files to reference the local files.
+The context is that ASF policy for websites changed to disallow external
+references to JS and CSS files, and these guides were generated long before that change.
+"""
+
+import os
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+import re
+import argparse
+
+def extract_version_from_url(url):
+    """Extract version number from URL if present."""
+    match = re.search(r'/(\d+\.\d+(\.\d+)?)/', url)
+    return match.group(1) if match else None
+
+def is_external_url(url):
+    """Check if a URL is external (starts with http/https or //)."""
+    if "apache.org" in url:
+        return False
+    return url.startswith("http://") or url.startswith("https://") or url.startswith("//")
+
+def download_file(url, dest_path):
+    """Download a file from a URL to a local path."""
+    if os.path.exists(dest_path):
+        #print(f"Skipping {url} (already downloaded to {dest_path})")
+        return
+    try:
+        if url.startswith("//"):
+            url = "https:" + url  # Default to HTTPS for protocol-relative URLs
+        if url.startswith("https://oss.maxcdn.com/";):
+            url = url.replace("https://oss.maxcdn.com/";, 
"https://cdnjs.cloudflare.com/ajax/";)
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        with open(dest_path, "wb") as f:
+            f.write(response.content)
+        print(f"Downloaded {url} to {dest_path}")
+    except Exception as e:
+        print(f"Failed to download {url}: {e}")
+
+def add_version_to_filename(filename, version):
+    """Add version number to filename if not already present.
+       Example: jquery.js -> jquery-3.6.0.js
+                jquery.min.js -> jquery-3.6.0.min.js
+    """
+    if filename.endswith(".min.js"):
+        filename_parts = filename.rsplit(".min.js", 1)
+        filename = f"{filename_parts[0]}-{version}.min.js"
+    elif filename.endswith(".min.css"):
+        filename_parts = filename.rsplit(".min.css", 1)
+        filename = f"{filename_parts[0]}-{version}.min.css"
+    else:
+        filename_parts = filename.rsplit('.', 1)
+        filename = f"{filename_parts[0]}-{version}.{filename_parts[1]}"
+    return filename
+
+def process_html_file(html_file_path, js_dir, css_dir, skip_files=()):
+    """Process an HTML file to localize external JS and CSS references."""
+    with open(html_file_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    modified = False
+    new_lines = []
+
+    for line in lines:
+        soup = BeautifulSoup(line, "html.parser")
+        script = soup.find("script", src=True)
+        link = soup.find("link", rel="stylesheet", href=True)
+
+        if script and is_external_url(script["src"]):
+            src = script["src"]
+            filename = os.path.basename(urlparse(src).path)
+            version = extract_version_from_url(src)
+            if version and not re.search(r'\d+\.\d+(\.\d+)?', filename):
+                filename = add_version_to_filename(filename, version)
+            local_path = os.path.join(js_dir, filename)
+            download_file(src, local_path)
+            script["src"] = f"js/{filename}"  # Relative path to js/ folder
+            new_lines.append(str(script) + "\n")
+            modified = True
+        elif link and is_external_url(link["href"]):
+            href = link["href"]
+            filename = os.path.basename(urlparse(href).path)
+            if filename not in skip_files:
+                version = extract_version_from_url(href)
+                if version and not re.search(r'\d+\.\d+(\.\d+)?', filename):
+                    filename = add_version_to_filename(filename, version)
+                local_path = os.path.join(css_dir, filename)
+                download_file(href, local_path)
+            link["href"] = f"css/{filename}"  # Relative path to css/ folder
+            new_lines.append(str(link) + "\n")
+            modified = True
+        else:
+            new_lines.append(line)
+
+    if modified:
+        with open(html_file_path, "w", encoding="utf-8") as f:
+            f.writelines(new_lines)
+        print(f"Updated {html_file_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description='Process HTML files to localize external JS and CSS references.')
+    parser.add_argument('folder', help='Folder of svn checkout (https://svn.apache.org/repos/infra/sites/solr/guide/)')
+    args = parser.parse_args()
+
+    base_dir = args.folder
+
+    # Iterate over the folder structure
+    folders = [name for name in os.listdir(base_dir) if re.match(r'\d+_\d+', name)]
+    if not folders:
+        print(f"No versioned directories 'N_M' found in {base_dir}, exiting.")
+        return
+    for root_dir in folders:
+        print(f"\nProcessing directory {root_dir}")
+        print(f"=================================")
+        full_path = os.path.join(base_dir, root_dir)
+        if not os.path.exists(full_path):
+            print(f"Directory {full_path} not found, skipping.")
+            continue
+
+        js_dir = os.path.join(full_path, "js")
+        css_dir = os.path.join(full_path, "css")
+        os.makedirs(js_dir, exist_ok=True)
+        os.makedirs(css_dir, exist_ok=True)
+
+        skip_files = ["font-awesome.min.css"]
+
+        # Process each HTML file in the directory
+        for filename in os.listdir(full_path):
+            if filename.endswith(".html"):
+                html_file_path = os.path.join(full_path, filename)
+                process_html_file(html_file_path, js_dir, css_dir, skip_files)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/dev-tools/scripts/requirements.txt b/dev-tools/scripts/requirements.txt
index efde9acc402..c86c9823c1c 100644
--- a/dev-tools/scripts/requirements.txt
+++ b/dev-tools/scripts/requirements.txt
@@ -6,4 +6,6 @@ ics~=0.7.2
 console-menu~=0.7.1
 PyGithub~=2.1.1
 jira~=3.4.1
-json
\ No newline at end of file
+json
+bs4
+requests
\ No newline at end of file
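
For anyone wanting to try the new script locally, a rough sketch of the intended workflow (the checkout directory name "solr-guide" is just an illustrative choice, not part of this commit; the svn URL is the one given in the script's argparse help):

    # check out the published ref-guide site from the svn repo referenced in the script's help text
    svn checkout https://svn.apache.org/repos/infra/sites/solr/guide/ solr-guide
    # install the script's dependencies (bs4 and requests were added to requirements.txt in this commit)
    pip install -r dev-tools/scripts/requirements.txt
    # download external JS/CSS and rewrite references in every versioned N_M folder of the checkout
    python3 dev-tools/scripts/refguide/refguide-download-js-css.py solr-guide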
