control: severity 767666 important control: tags 767666 patch control: tags 767666 pending
Hi, this is not only a problem for -k but also for -c or any basic invocation. If there is text data containing "copyright" followed by repeated numeric text, as in some dictionaries, it is treated as copyright text and the program tries to parse it. Since such data can be huge, this fills up memory and slows things down considerably. Since these are useless false positives, it is better to throw them out after printing a warning message. This is a sanity check :-) Here is a patch to fix this. Osamu
>From 497b977345e57ea41a4812cece735f37989b48f6 Mon Sep 17 00:00:00 2001 From: Osamu Aoki <[email protected]> Date: Sun, 9 Nov 2014 00:34:10 +0900 Subject: [PATCH] Sanity check for copyright lines --- debmake/copyright.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/debmake/copyright.py b/debmake/copyright.py index bc4caf5..4e56703 100755 --- a/debmake/copyright.py +++ b/debmake/copyright.py @@ -33,6 +33,11 @@ import sys import debmake.debug import debmake.lc ################################################################### +# Constants for sanity +################################################################### +MAX_COPYRIGHT_LINES = 256 +MAX_COPYRIGHT_LENGTH = 2048 +################################################################### # Parse year within a copyright line ################################################################### re_year_1900 = re.compile(r''' @@ -673,6 +678,19 @@ def check_lines(lines): debmake.debug.debug('De: *end* format={}->{}, content={}->{}, copyright={}, license={}: "{}"'.format(fs[xformat_state], fs[format_state], cs[xcontent_state], cs[content_state], copyright_found, license_found, line), type='e') ########################################################################## # main loop for lines (end) + # sanitize copyright_lines + ########################################################################## + if len(copyright_lines) > MAX_COPYRIGHT_LINES: + print('W: !!!!! too many copyright lines !!!!!', file=sys.stderr) + print('W: starting with {}'.format(copyright_lines[0]), file=sys.stderr) + copyright_lines = copyright_lines[:MAX_COPYRIGHT_LINES] + for (i, line) in enumerate(copyright_lines): + if len(line) > MAX_COPYRIGHT_LENGTH: + copyright_lines[i] = line[:MAX_COPYRIGHT_LENGTH] + print('W: !!!!! 
too long copyright line !!!!!', file=sys.stderr) + print('W: starting with {}'.format(copyright_lines[i]), file=sys.stderr) + ########################################################################## + # analyze copyright ########################################################################## copyright_data = analyze_copyright(copyright_lines) license_lines = clean_license(license_lines) -- 2.1.3

