This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 8b8dc5fb Track when error reports last issued to reduce noise
8b8dc5fb is described below

commit 8b8dc5fb9740805d9ff7568b3a9e84adcf1b2309
Author: Sebb <[email protected]>
AuthorDate: Wed Oct 15 17:42:37 2025 +0100

    Track when error reports last issued to reduce noise
---
 tools/site-scan.rb | 55 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 6728e5db..46267371 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -63,12 +63,28 @@ end
   bits.join(' ')
 end
 
+def report_error(data, site, key, message)
+  if $show_anyway
+    $stderr.puts message
+    data[:error_reports][key] = $time_now
+  else
+    last_reported = ($previous_data[site]||{})[key] || 0
+    if $time_now - last_reported > SECS_PER_DAY
+      $stderr.puts message
+      data[:error_reports][key] = $time_now
+    else
+      data[:error_reports][key] = last_reported
+    end
+  end
+end
+
 # Parse an Apache project website and return text|urls that match our checks
 # @return Hash of symbols: text|url found from a check made
 # @see SiteStandards for definitions of what we should scan for (in general)
 def parse(id, site, name, podling=false)
-  show_anyway = Time.now.gmtime.strftime('%H') == '08' # show suppressed errors once a day
   data = {}
+   # keep track of error reports to suppress unnecessary repeats
+  data[:error_reports] = {} # key = url, value = last reported epoch
   # force https to avoid issue with cache (sites should use https anyway)
   site.sub!(%r{^http:},'https:')
   SiteStandards::COMMON_CHECKS.each_key do |k|
@@ -176,9 +192,7 @@ def parse(id, site, name, podling=false)
         subpages[site2.to_s] = a
       end
     rescue StandardError => e
-      if show_anyway or !a_href.include?('producthunt.com/products/apache-echarts') # reported but not yet fixed, so limit report frequency
-        $stderr.puts "@#{__LINE__}: #{id}: Bad a_href #{a_href} #{e}"
-      end
+      report_error data, id, a_href, "@#{__LINE__}: #{id}: Bad a_href #{a_href} #{e}"
     end
   end
 
@@ -226,9 +240,7 @@ def parse(id, site, name, podling=false)
             nodisclaimer << subpage
           end
         else
-          if show_anyway or !%w(gluten).include? id # reported, so suppress multiple reports
-            $stderr.puts "@#{__LINE__}: #{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
-          end
+          report_error data, id, subpage, "@#{__LINE__}: #{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
         end
       rescue URI::InvalidURIError
         # ignore
@@ -345,6 +357,22 @@ ensure
   return stdout, stderr, status
 end
 
+def get_report_data(input)
+    out = {}
+    begin
+      data = JSON.parse(File.read(input, :encoding => 'utf-8')).each do |k,v|
+        er = v['error_reports']
+        if er
+          # Drop stale data
+          out[k] = er.select {|u,t| $time_now - t <= 2 * SECS_PER_DAY}
+        end
+      end
+    rescue StandardError => e
+      $stderr.puts e
+    end
+    out
+end
+
 #########################################################################
 # Main execution begins here
 results = {}
@@ -354,6 +382,9 @@ $verbose = ARGV.delete '--verbose'
 $saveparse = ARGV.delete '--saveparse'
 $skipresourcecheck = ARGV.delete '--noresource'
 $podling = ARGV.delete('--podling')
+$time_now = Time.now.gmtime.to_i
+$show_anyway = Time.now.gmtime.strftime('%H') == '08' # show suppressed errors once a day
+SECS_PER_DAY=24*60*60
 
 sites_checked = 0
 sites_failed = 0
@@ -388,6 +419,11 @@ else
     output_projects = nil
   end
 
+  $previous_data = {}
+  if output_projects
+    $previous_data = get_report_data output_projects
+  end
+
   # Scan committees, including non-pmcs
   ASF::Committee.load_committee_info
   committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
@@ -406,6 +442,11 @@ else
    $skipresourcecheck = ($skipresourcecheck or sites_failed > 10 or (sites_failed > 3 and sites_failed == sites_checked))
   end
 
+  $previous_data = {}
+  if output_podlings
+    $previous_data = get_report_data output_podlings
+  end
+
   # Scan podlings that have a website
   ASF::Podling.list.sort_by(&:name).each do |podling|
     if podling.status == 'current' and podling.podlingStatus[:website]

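For reference, below is a minimal standalone sketch (not part of the commit) of the suppression logic introduced by this patch. It copies the report_error method and the SECS_PER_DAY / $time_now / $show_anyway / $previous_data globals from the diff above; the site id and URLs are invented purely for illustration.

# Standalone sketch of the daily error-report suppression added in this commit.
# Globals mirror those defined in site-scan.rb; sample values are invented.
SECS_PER_DAY = 24 * 60 * 60
$time_now = Time.now.gmtime.to_i
$show_anyway = false                 # pretend this is not the 08:00 UTC run
$previous_data = {                   # what get_report_data would have loaded
  'example' => { 'https://example.invalid/broken' => $time_now - 2 * 60 * 60 }
}

def report_error(data, site, key, message)
  if $show_anyway
    $stderr.puts message
    data[:error_reports][key] = $time_now
  else
    last_reported = ($previous_data[site] || {})[key] || 0
    if $time_now - last_reported > SECS_PER_DAY
      $stderr.puts message
      data[:error_reports][key] = $time_now
    else
      data[:error_reports][key] = last_reported
    end
  end
end

data = { error_reports: {} }
# Reported two hours ago: suppressed, and the old timestamp is carried forward.
report_error data, 'example', 'https://example.invalid/broken', 'Bad a_href (suppressed)'
# Not seen in the previous run: printed and stamped with the current time.
report_error data, 'example', 'https://example.invalid/new', 'Bad a_href (printed)'
p data[:error_reports]

Running the sketch prints only the second message; the saved timestamps are written back with the scan results, and on the next run get_report_data drops any entry older than 2 * SECS_PER_DAY before the cycle repeats.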