This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new af2ab098 Add check for JS events link
af2ab098 is described below
commit af2ab0986aefb5b63fe8bec98d539db59d5c5247
Author: Sebb <[email protected]>
AuthorDate: Sat Aug 19 20:35:17 2023 +0100
Add check for JS events link
---
lib/whimsy/sitestandards.rb | 7 +++++--
tools/site-scan.rb | 50 ++++++++++++++++++++++++++++++++-------------
2 files changed, 41 insertions(+), 16 deletions(-)
diff --git a/lib/whimsy/sitestandards.rb b/lib/whimsy/sitestandards.rb
index 5c59c23c..392a9df1 100644
--- a/lib/whimsy/sitestandards.rb
+++ b/lib/whimsy/sitestandards.rb
@@ -52,10 +52,13 @@ module SiteStandards
CHECK_POLICY =>
'https://www.apache.org/foundation/marks/pmcs#navigation',
CHECK_DOC => 'All projects must feature some prominent link back to the
main ASF homepage at http://www.apache.org/',
},
+ # <script src="https://www.apachecon.com/event-images/snippet.js">
+ # https://events.apache.org/x/current-event[.html]
+ # https://[www.]apache.org/events/current-event[.html]
'events' => { # Custom: a_href.include? 'apache.org/events/' then custom
check for img
CHECK_TEXT => nil,
- CHECK_CAPTURE => %r{apache\.org/events},
- CHECK_VALIDATE => %r{^https?://.*apache.org/events/current-event},
+ CHECK_CAPTURE => %r{(events|x)/current-event|event-images},
+ CHECK_VALIDATE =>
%r{^https?://((www\.)?apache\.org/events/current-event|events\.apache.org/x/current-event|www\.apachecon\.com/event-images/snippet\.js)},
CHECK_TYPE => true,
CHECK_POLICY => 'https://www.apache.org/events/README.txt',
CHECK_DOC => 'Projects SHOULD include a link to any current ApacheCon
event, as provided by VP, Conferences.',
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index bdac6971..3e1145a3 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -41,6 +41,16 @@ def getText(txt, node, match=/Apache Software Foundation/i)
return txt, parent
end
+# helper for multiple events
+# TODO should we show them all?
+def save_events(data, value)
+ if data[:events]
+ puts "Events: already have #{data[:events]}, not storing #{value}"
+ else
+ data[:events] = value
+ end
+end
+
# Parse an Apache project website and return text|urls that match our checks
# @return Hash of symbols: text|url found from a check made
# @see SiteStandards for definitions of what we should scan for (in general)
@@ -74,7 +84,18 @@ def parse(id, site, name)
data[:uri] = uri.to_s
# FIRST: scan each link's a_href to see if we need to capture it
- doc.css('a').each do |a|
+ # also capture script src for events
+ doc.traverse do |a|
+
+ if a.name == 'script'
+ a_src = a['src'].to_s.strip
+ if a_src =~
SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
+ save_events data, uri + a_src
+ end
+ end
+
+ next unless a.name == 'a'
+
# Normalize the text and href for our capture purposes
a_href = a['href'].to_s.strip
a_text = a.text.downcase.strip
@@ -92,12 +113,8 @@ def parse(id, site, name)
end
if a_href =~
SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
- img = a.at('img')
- if img
- data[:events] = uri + img['src'].strip
- else
- data[:events] = uri + a_href
- end
+ # Hack to ignore hidden links on main site
+ save_events data, uri + a_href unless a['class'] == 'visible-home' and
uri.path != '/'
end
# Check the a_text strings for other patterns
@@ -150,14 +167,18 @@ def parse(id, site, name)
data[:image] = ASF::SiteImage.find(id)
# Check for resource loading from non-ASF domains
- cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
- out, err, status = exec_with_timeout(cmd, 30)
- if status
- ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
- resources = ext_urls.values.sum
- data[:resources] = "Found #{resources} external resources: #{ext_urls}"
+ if $skipresourcecheck
+ data[:resources] = "Not checked"
else
- data[:resources] = err
+ cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
+ out, err, status = exec_with_timeout(cmd, 30)
+ if status
+ ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
+ resources = ext_urls.values.sum
+ data[:resources] = "Found #{resources} external resources: #{ext_urls}"
+ else
+ data[:resources] = err
+ end
end
# TODO: does not find js references such as:
@@ -214,6 +235,7 @@ results = {}
podlings = {}
$cache = Cache.new(dir: 'site-scan')
$verbose = ARGV.delete '--verbose'
+$skipresourcecheck = ARGV.delete '--noresource'
puts "Started: #{Time.now}" # must agree with site-scan monitor