bin/crashreportScraper.py |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

New commits:
commit cc47577dbf5876dd9804305f5fe33c836d80b2c1
Author:     Xisco Fauli <[email protected]>
AuthorDate: Thu Jan 8 17:47:33 2026 +0100
Commit:     Xisco Fauli <[email protected]>
CommitDate: Fri Jan 9 09:58:30 2026 +0100

    crashreportScraper: fix script
    
    In order to avoid 'Remote end closed connection without response'
    
    Change-Id: I9bf496d7e2ab936fc399f1eda2e691f8ea1dcf9a
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196863
    Tested-by: Xisco Fauli <[email protected]>
    Reviewed-by: Xisco Fauli <[email protected]>

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index e3c8c0328236..48331d4e280a 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -63,11 +63,12 @@ def convert_str_to_date(value):
     value = ", ".join(value.split(", ")[:-1])
     return datetime.strptime(value, '%b %d, %Y')
 
-def parse_version_url(url):
+def parse_version_url(version, session):
     crashReports = {}
+    url = "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000&days=30"
 
     try:
-        html_text = requests.get(url, timeout=200).text
+        html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout requesting " + url)
@@ -84,9 +85,10 @@ def parse_version_url(url):
 
     return crashReports
 
-def parse_reports_and_get_most_recent_report_from_last_page(url):
+def parse_reports_and_get_most_recent_report_from_last_page(signature, session):
     try:
-        html_text = requests.get(url, timeout=200).text
+        url = "https://crashreport.libreoffice.org/stats/signature/" + signature
+        html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout")
@@ -132,9 +134,10 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
 
     return count, ID, OS
 
-def parse_details_and_get_info(url, gitRepo, gitBranch):
+def parse_details_and_get_info(crashID, session, gitRepo, gitBranch):
     try:
-        html_text = requests.get(url, timeout=200).text
+        url = "https://crashreport.libreoffice.org/stats/crash_details/" + crashID
+        html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout")
@@ -193,8 +196,10 @@ if __name__ == '__main__':
 
     gitBranch = git.Repo(args.repository).active_branch.name
 
-    crashes = parse_version_url(
-            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")
+    session = requests.Session()
+    session.headers.update({'Referer': 'https://crashreport.libreoffice.org'})
+
+    crashes = parse_version_url(args.version, session)
 
     print(str(len(crashes)) + " crash reports in version " + args.version)
 
@@ -221,12 +226,12 @@ if __name__ == '__main__':
                 f.write("<tr>")
                 try:
                     crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
-                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
+                            urllib.parse.quote(k), session)
                     if crashCount == 0:
                         continue
 
                     crashReason, codeStack, unoCommands = parse_details_and_get_info(
-                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository, gitBranch)
+                            crashID, session, args.repository, gitBranch)
                     ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                     count += 1
                     f.write("<td id=\"td1\">" + str(count) + "</td>")

Reply via email to