bin/crashreportScraper.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-)
New commits: commit cc47577dbf5876dd9804305f5fe33c836d80b2c1 Author: Xisco Fauli <[email protected]> AuthorDate: Thu Jan 8 17:47:33 2026 +0100 Commit: Xisco Fauli <[email protected]> CommitDate: Fri Jan 9 09:58:30 2026 +0100 crashreportScraper: fix script In order to avoid 'Remote end closed connection without response' Change-Id: I9bf496d7e2ab936fc399f1eda2e691f8ea1dcf9a Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196863 Tested-by: Xisco Fauli <[email protected]> Reviewed-by: Xisco Fauli <[email protected]> diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py index e3c8c0328236..48331d4e280a 100755 --- a/bin/crashreportScraper.py +++ b/bin/crashreportScraper.py @@ -63,11 +63,12 @@ def convert_str_to_date(value): value = ", ".join(value.split(", ")[:-1]) return datetime.strptime(value, '%b %d, %Y') -def parse_version_url(url): +def parse_version_url(version, session): crashReports = {} + url = "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000&days=30" try: - html_text = requests.get(url, timeout=200).text + html_text = session.get(url, timeout=200).text soup = BeautifulSoup(html_text, 'html.parser') except requests.exceptions.Timeout: print("Timeout requesting " + url) @@ -84,9 +85,10 @@ def parse_version_url(url): return crashReports -def parse_reports_and_get_most_recent_report_from_last_page(url): +def parse_reports_and_get_most_recent_report_from_last_page(signature, session): try: - html_text = requests.get(url, timeout=200).text + url = "https://crashreport.libreoffice.org/stats/signature/" + signature + html_text = session.get(url, timeout=200).text soup = BeautifulSoup(html_text, 'html.parser') except requests.exceptions.Timeout: print("Timeout") @@ -132,9 +134,10 @@ def parse_reports_and_get_most_recent_report_from_last_page(url): return count, ID, OS -def parse_details_and_get_info(url, gitRepo, gitBranch): +def parse_details_and_get_info(crashId, session, gitRepo, gitBranch): try: - html_text = 
requests.get(url, timeout=200).text + url = "https://crashreport.libreoffice.org/stats/crash_details/" + crashId + html_text = session.get(url, timeout=200).text soup = BeautifulSoup(html_text, 'html.parser') except requests.exceptions.Timeout: print("Timeout") @@ -193,8 +196,10 @@ if __name__ == '__main__': gitBranch = git.Repo(args.repository).active_branch.name - crashes = parse_version_url( - "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30") + session = requests.Session() + session.headers.update({'Referer': 'https://crashreport.libreoffice.org'}) + + crashes = parse_version_url(args.version, session) print(str(len(crashes)) + " crash reports in version " + args.version) @@ -221,12 +226,12 @@ if __name__ == '__main__': f.write("<tr>") try: crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page( - "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k)) + urllib.parse.quote(k), session) if crashCount == 0: continue crashReason, codeStack, unoCommands = parse_details_and_get_info( - "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository, gitBranch) + crashID, session, args.repository, gitBranch) ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2) count += 1 f.write("<td id=\"td1\">" + str(count) + "</td>")
