This is an automated email from the ASF dual-hosted git repository. sebb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/whimsy.git
commit 736aa0fdc2d4517b1e8617b17bc7ee061f1cbdda
Author: Sebb <s...@apache.org>
AuthorDate: Thu Apr 25 14:00:51 2024 +0100

    Show anchor for error URLs
---
 tools/site-scan.rb | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 1a907412..3790ee23 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -7,7 +7,6 @@
 #
 # Makes no value judgements. Simply extracts raw data for offline analysis.
 $LOAD_PATH.unshift '/srv/whimsy/lib'
-require 'set'
 require 'net/http'
 require 'nokogiri'
 require 'json'
@@ -103,7 +102,7 @@ def parse(id, site, name, podling=false)
   end
   data[:uri] = uri.to_s
 
-  subpages = Set.new
+  subpages = Hash.new
   # FIRST: scan each link's a_href to see if we need to capture it
   # also capture script src for events, and some page refs for podlings
   doc.traverse do |a|
@@ -167,7 +166,7 @@ def parse(id, site, name, podling=false)
             site2 = URI.join(site,a_href.gsub(' ','%20')) # HACK
           end
           if site2.host == uri.host and site2.path.size > 2
-            subpages.add site2.to_s
+            subpages[site2.to_s] = a
           end
         rescue StandardError => e
           $stderr.puts "#{id}: Bad a_href #{a_href} #{e}"
@@ -205,7 +204,7 @@ def parse(id, site, name, podling=false)
   if podling
     hasdisclaimer = 0
     nodisclaimer = []
-    subpages.each do |subpage|
+    subpages.each do |subpage, anchor|
       begin
         uri, response, status = $cache.get(subpage)
         if uri&.to_s == subpage or uri&.to_s == subpage + '/'
@@ -219,6 +218,8 @@ def parse(id, site, name, podling=false)
           else
             nodisclaimer << subpage
           end
+        else
+          $stderr.puts "#{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
         end
       rescue URI::InvalidURIError
       end