This is an automated email from the ASF dual-hosted git repository. sebb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new 87295b5c Look for disclaimer away from home page
87295b5c is described below

commit 87295b5c305ba64447012d3285c97592b3c6d5d5
Author: Sebb <s...@apache.org>
AuthorDate: Thu Apr 25 01:03:47 2024 +0100

Look for disclaimer away from home page

TODO: add this into reports
---
tools/site-scan.rb | 45 ++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 4e267580..f57a832e 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -7,6 +7,7 @@
 #
 # Makes no value judgements. Simply extracts raw data for offline analysis.
 $LOAD_PATH.unshift '/srv/whimsy/lib'
+require 'set'
 require 'net/http'
 require 'nokogiri'
 require 'json'
@@ -66,7 +67,7 @@ end
 # Parse an Apache project website and return text|urls that match our checks
 # @return Hash of symbols: text|url found from a check made
 # @see SiteStandards for definitions of what we should scan for (in general)
-def parse(id, site, name)
+def parse(id, site, name, podling=false)
 data = {}
 # force https to avoid issue with cache (sites should use https anyway)
 site.sub!(%r{^http:},'https:')
@@ -102,8 +103,9 @@ def parse(id, site, name)
 end
 data[:uri] = uri.to_s
+ subpages = Set.new
 # FIRST: scan each link's a_href to see if we need to capture it
- # also capture script src for events
+ # also capture script src for events, and some page refs for podlings
 doc.traverse do |a|
 if a.name == 'script'
@@ -157,6 +159,16 @@ def parse(id, site, name)
 end
 end
 end
+ unless a_href =~ %r{^(#|mailto:)}
+ begin
+ site2 = URI.join(site,a_href.gsub(' ','+'))
+ if site2.host == uri.host and site2.path.size > 2
+ subpages.add site2.to_s
+ end
+ rescue StandardError
+ $stderr.puts "Bad a_href #{a_href}"
+ end
+ end
 end
 # SECOND: scan each text node to match and capture
@@ -183,6 +195,28 @@ def parse(id, site, name)
 data[:disclaimer] = t
 end
 end
+
+ # Brief scan of initial sub-pages to look for disclaimers
+ # TODO also look for a download page?
+ if podling
+ hasdisclaimer = 0
+ nodisclaimer = []
+ subpages.each do |subpage|
+ begin
+ uri, response, status = $cache.get(subpage)
+ if response =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
+ hasdisclaimer += 1
+ else
+ nodisclaimer << subpage
+ end
+ rescue URI::InvalidURIError
+ end
+ end
+ if nodisclaimer.size > 0
+ data[:disclaimers] = [hasdisclaimer, nodisclaimer]
+ end
+ end
+
 # THIRD: see if an image has been uploaded
 data[:image] = ASF::SiteImage.find(id)
@@ -299,10 +333,11 @@ puts "Started: #{Time.now}" # must agree with site-scan monitor
 # If additional projname|podlingname are provided, only scans those sites
 if ARGV.first =~ /^https?:\/\/\w/ # Scan a single URL provided by user
- site = ARGV.shift
+ podling = ARGV.delete('--podling')
+ site = ARGV.shift.dup # needs to be unfrozen
 name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
 output_projects = ARGV.shift
- results[name] = parse(name, site, name)
+ results[name] = parse(name, site, name, podling)
 else # Gather output filenames (if any) and scan various projects
 if ARGV.first =~ %r{[./]} # have we a file name?
@@ -335,7 +370,7 @@ else
 if ARGV.length > 0
 next unless ARGV.include? podling.name
 end
- podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
+ podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name, true)
 end
 end
 end