This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new f5767fd9 Only want http(s) links; ignore some extensions
f5767fd9 is described below

commit f5767fd9bc88189c9f6873d6c381dd28f4d8fabd
Author: Sebb <[email protected]>
AuthorDate: Wed Oct 15 16:54:33 2025 +0100

    Only want http(s) links; ignore some extensions
---
 tools/site-scan.rb | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index bce50ce5..6728e5db 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -160,21 +160,24 @@ def parse(id, site, name, podling=false)
         end
       end
     end
-    unless a_href =~ %r{^(#|mailto:)}
-      begin
-        if a_href =~ %r{^https?://} # no need to rebase this
-          site2 = URI.parse(a_href.gsub(' ','%20').gsub('|', '%7C')) # needs to be a URI
-        else
-          site2 = URI.join(site,a_href.gsub(' ','%20').gsub('|', '%7C')) # HACK
-        end
-        # podling sites are reachable via two urls (allow for nil)
-        if site2.host&.sub('.incubator.', '.') == uri.host&.sub('.incubator.', '.') and site2.path.size > 2
-          subpages[site2.to_s] = a
-        end
-      rescue StandardError => e
-        if show_anyway or !a_href.include?('producthunt.com/products/apache-echarts') # reported but not yet fixed, so limit report frequency
-          $stderr.puts "@#{__LINE__}: #{id}: Bad a_href #{a_href} #{e}"
-        end
+    begin
+      # only want http(s) or relative links
+      href_uri = URI.parse(a_href.gsub(' ','%20').gsub('|', '%7C')) # needs to be a URI
+      scheme = href_uri.scheme
+      if %w(http https).include? scheme # no need to rebase this
+        site2 = href_uri
+      elsif scheme.nil? # relative
+        site2 = URI.join(site, href_uri.path) # HACK
+      else # something else
+        site2 = nil
+      end
+      # podling sites are reachable via two urls (allow for nil)
+      if !site2.nil? and site2.host&.sub('.incubator.', '.') == uri.host&.sub('.incubator.', '.') and site2.path.size > 2
+        subpages[site2.to_s] = a
+      end
+    rescue StandardError => e
+      if show_anyway or !a_href.include?('producthunt.com/products/apache-echarts') # reported but not yet fixed, so limit report frequency
+        $stderr.puts "@#{__LINE__}: #{id}: Bad a_href #{a_href} #{e}"
       end
     end
   end
@@ -208,7 +211,7 @@ def parse(id, site, name, podling=false)
   hasdisclaimer = 0
   nodisclaimer = []
   subpages.each do |subpage, anchor|
-    if podling
+    if podling and not %w{.png .pdf .jpg}.include?File.extname(subpage)
       begin
         uri, response, status = $cache.get(subpage)
         if uri&.to_s == subpage or uri&.to_s == subpage + '/'

Reply via email to