(tooling-trusted-release) branch main updated: Make license file and header checks much more lenient

sbp Tue, 13 May 2025 07:00:19 -0700

This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-release.git



The following commit(s) were added to refs/heads/main by this push:
     new 657bf05  Make license file and header checks much more lenient
657bf05 is described below

commit 657bf05b2a6ad8c748806b6d5dd16bc6fb4689eb
Author: Sean B. Palmer <[email protected]>
AuthorDate: Tue May 13 14:49:50 2025 +0100

    Make license file and header checks much more lenient
---
 atr/tasks/checks/license.py   | 262 ++++++++++--------------------------------
 atr/tasks/checks/zipformat.py |  17 +--
 2 files changed, 66 insertions(+), 213 deletions(-)

diff --git a/atr/tasks/checks/license.py b/atr/tasks/checks/license.py
index ac7f6db..fba7f68 100644
--- a/atr/tasks/checks/license.py
+++ b/atr/tasks/checks/license.py
@@ -32,113 +32,19 @@ _LOGGER: Final = logging.getLogger(__name__)
 
 
 # Constant that must be present in the Apache License header
-APACHE_LICENSE_HEADER: Final[bytes] = b"""\
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License."""
-
-
-# File type comment style definitions
-# Ordered by their popularity in the Stack Overflow Developer Survey 2024
-COMMENT_STYLES: Final[dict[str, dict[str, str]]] = {
-    # JavaScript and variants
-    "js": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "mjs": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "cjs": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "jsx": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Python
-    "py": {"single": "#", "multi_start": '"""', "multi_end": '"""'},
-    # SQL
-    "sql": {"single": "--", "multi_start": "/*", "multi_end": "*/"},
-    "ddl": {"single": "--", "multi_start": "/*", "multi_end": "*/"},
-    "dml": {"single": "--", "multi_start": "/*", "multi_end": "*/"},
-    # TypeScript and variants
-    "ts": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "tsx": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "mts": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "cts": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Shell scripts
-    "sh": {"single": "#"},
-    "bash": {"single": "#"},
-    "zsh": {"single": "#"},
-    "ksh": {"single": "#"},
-    # Java
-    "java": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "jav": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # C#
-    "cs": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "csx": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # C++
-    "cpp": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "cxx": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "cc": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "hpp": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # C
-    "c": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "h": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # PHP
-    "php": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "phtml": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # PowerShell
-    "ps1": {"single": "#", "multi_start": "<#", "multi_end": "#>"},
-    "psm1": {"single": "#", "multi_start": "<#", "multi_end": "#>"},
-    "psd1": {"single": "#", "multi_start": "<#", "multi_end": "#>"},
-    # Go
-    "go": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Rust
-    "rs": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Kotlin
-    "kt": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "kts": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Lua
-    "lua": {"single": "--", "multi_start": "--[[", "multi_end": "]]"},
-    # Dart
-    "dart": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Assembly
-    "asm": {"single": ";"},
-    "s": {"single": "#"},
-    "S": {"single": "#"},
-    # Ruby
-    "rb": {"single": "#", "multi_start": "=begin", "multi_end": "=end"},
-    "rbw": {"single": "#", "multi_start": "=begin", "multi_end": "=end"},
-    # Swift
-    "swift": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # R
-    "r": {"single": "#"},
-    "R": {"single": "#"},
-    # Visual Basic
-    "vb": {"single": "'", "multi_start": "/*", "multi_end": "*/"},
-    "vbs": {"single": "'", "multi_start": "/*", "multi_end": "*/"},
-    # MATLAB
-    "m": {"single": "%", "multi_start": "%{", "multi_end": "%}"},
-    # VBA
-    "vba": {"single": "'"},
-    # Groovy
-    "groovy": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "gvy": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "gy": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "gsh": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Scala
-    "scala": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    "sc": {"single": "//", "multi_start": "/*", "multi_end": "*/"},
-    # Perl
-    "pl": {"single": "#", "multi_start": "=pod", "multi_end": "=cut"},
-    "pm": {"single": "#", "multi_start": "=pod", "multi_end": "=cut"},
-    "t": {"single": "#", "multi_start": "=pod", "multi_end": "=cut"},
-}
+HTTP_APACHE_LICENSE_HEADER: Final[bytes] = (
+    b"Licensed to the Apache Software Foundation ASF under one or mor"
+    b"e contributor license agreements See the NOTICE file distribute"
+    b"d with this work for additional information regarding copyright"
+    b" ownership The ASF licenses this file to you under the Apache L"
+    b"icense Version 2 0 the License you may not use this file except"
+    b" in compliance with the License You may obtain a copy of the Li"
+    b"cense at http www apache org licenses LICENSE 2 0 Unless requir"
+    b"ed by applicable law or agreed to in writing software distribut"
+    b"ed under the License"
+)
+
+HTTPS_APACHE_LICENSE_HEADER: Final[bytes] = HTTP_APACHE_LICENSE_HEADER.replace(b" http ", b" https ")
 
 # Patterns for files to include in license header checks
 # Ordered by their popularity in the Stack Overflow Developer Survey 2024
@@ -262,52 +168,24 @@ async def headers(args: checks.FunctionArguments) -> str | None:
     return None
 
 
-def strip_comments(content: bytes, file_ext: str) -> bytes:
-    """Strip comment prefixes from the content based on the file extension."""
-    if file_ext not in COMMENT_STYLES:
-        return content
-
-    comment_style = COMMENT_STYLES[file_ext]
-    lines = content.split(b"\n")
-    cleaned_lines = []
-
-    # Get comment markers as bytes
-    multi_start = comment_style.get("multi_start", "").encode()
-    multi_end = comment_style.get("multi_end", "").encode()
-    single = comment_style.get("single", "").encode()
-
-    # State tracking
-    in_multiline = False
-    is_c_style = (multi_start == b"/*") and (multi_end == b"*/")
-
-    for line in lines:
-        line = line.strip()
-
-        # Handle start of multi-line comment
-        if not in_multiline and multi_start and multi_start in line:
-            # Get content after multi-start
-            line = line[line.find(multi_start) + len(multi_start) :].strip()
-            in_multiline = True
-
-        # Handle end of multi-line comment
-        elif in_multiline and multi_end and multi_end in line:
-            # Get content before multi-end
-            line = line[: line.find(multi_end)].strip()
-            in_multiline = False
-
-        # Handle single-line comments
-        elif not in_multiline and single and line.startswith(single):
-            line = line[len(single) :].strip()
+def headers_validate(content: bytes, _filename: str) -> tuple[bool, str | None]:
+    """Validate that the content contains the Apache License header."""
+    r_span = re.compile(rb"Licensed to the.*?under the License", re.MULTILINE)
+    r_words = re.compile(rb"[A-Za-z0-9]+")
 
-        # For C style comments, strip leading asterisk if present
-        elif is_c_style and in_multiline and line.startswith(b"*"):
-            line = line[1:].strip()
+    # Normalise the content
+    content = re.sub(rb"[ \t\r\n]+", b" ", content)
 
-        # Only add non-empty lines
-        if line:
-            cleaned_lines.append(line)
-
-    return b"\n".join(cleaned_lines)
+    # For each matching heuristic span...
+    for span in r_span.finditer(content):
+        # Get only the words in the span
+        words = r_words.findall(span.group(0))
+        joined = b" ".join(words)
+        if joined == HTTP_APACHE_LICENSE_HEADER:
+            return True, None
+        elif joined == HTTPS_APACHE_LICENSE_HEADER:
+            return True, None
+    return False, "Could not find Apache License header"
 
 
 # File helpers
@@ -355,14 +233,30 @@ def _files_check_core_logic(artifact_path: str) -> Iterator[Result]:
 
     yield from _files_messages_build(files_found, license_ok, notice_ok, notice_issues)
 
-    is_valid = license_ok and notice_ok
-    yield ArtifactResult(
-        status=models.CheckResultStatus.SUCCESS if is_valid else models.CheckResultStatus.FAILURE,
-        message="LICENSE and NOTICE files present and valid"
-        if is_valid
-        else "Issues found with LICENSE or NOTICE files",
-        data=None,
-    )
+    if license_ok and notice_ok:
+        yield ArtifactResult(
+            status=models.CheckResultStatus.SUCCESS,
+            message="LICENSE and NOTICE files present and valid",
+            data=None,
+        )
+    elif license_ok:
+        yield ArtifactResult(
+            status=models.CheckResultStatus.FAILURE,
+            message="LICENSE file present but NOTICE file is not valid",
+            data=None,
+        )
+    elif notice_ok:
+        yield ArtifactResult(
+            status=models.CheckResultStatus.FAILURE,
+            message="NOTICE file present but LICENSE file is not valid",
+            data=None,
+        )
+    else:
+        yield ArtifactResult(
+            status=models.CheckResultStatus.FAILURE,
+            message="LICENSE and NOTICE files are not valid",
+            data=None,
+        )
 
 
 def _files_check_core_logic_license(tf: tarfile.TarFile, member: tarfile.TarInfo) -> bool:
@@ -371,10 +265,15 @@ def _files_check_core_logic_license(tf: tarfile.TarFile, member: tarfile.TarInfo
     if not f:
         return False
 
+    sha3_expected = "5efa4839f385df309ffc022ca5ce9763c4bc709dab862ca77d9a894db6598456"
     sha3 = hashlib.sha3_256()
-    content = f.read()
-    sha3.update(content[:11358])
-    return sha3.hexdigest() == "8a0a8fb6c73ef27e4322391c7b28e5b38639e64e58c40a2c7a51cec6e7915a6a"
+    for line in f:
+        octets = line.strip(b" \t\r\n")
+        if octets:
+            sha3.update(octets)
+        if sha3.hexdigest() == sha3_expected:
+            return True
+    return False
 
 
 def _files_check_core_logic_notice(tf: tarfile.TarFile, member: tarfile.TarInfo) -> tuple[bool, list[str]]:
@@ -536,8 +435,8 @@ def _headers_check_core_logic_process_file(
 
         # Allow for some extra content at the start of the file
         # That may be shebangs, encoding declarations, etc.
-        content = f.read(len(APACHE_LICENSE_HEADER) + 512)
-        is_valid, error = _headers_validate(content, member.name)
+        content = f.read(4096)
+        is_valid, error = headers_validate(content, member.name)
         if is_valid:
             return MemberResult(
                 status=models.CheckResultStatus.SUCCESS,
@@ -567,10 +466,6 @@ def _headers_check_core_logic_should_check(filepath: str) -> bool:
     if ext is None:
         return False
 
-    # First check if we have comment style definitions for this extension
-    if ext not in COMMENT_STYLES:
-        return False
-
     # Then check if the file matches any of our included patterns
     for pattern in INCLUDED_PATTERNS:
         if re.search(pattern, filepath, re.IGNORECASE):
@@ -579,37 +474,6 @@ def _headers_check_core_logic_should_check(filepath: str) -> bool:
     return False
 
 
-def _headers_validate(content: bytes, filename: str) -> tuple[bool, str | None]:
-    """Validate that the content contains the Apache License header after removing comments."""
-    # Get the file extension from the filename
-    file_ext = _get_file_extension(filename)
-    if not file_ext or file_ext not in COMMENT_STYLES:
-        return False, "Could not determine file type from extension"
-
-    # Strip comments, removing empty lines in the process
-    cleaned_header = strip_comments(content, file_ext)
-
-    # Normalise the expected header in the same way as directly above
-    expected_lines = [line.strip() for line in APACHE_LICENSE_HEADER.split(b"\n")]
-    expected_lines = [line for line in expected_lines if line]
-    expected_header = b"\n".join(expected_lines)
-
-    # Check if the cleaned header contains the expected text
-    if expected_header not in cleaned_header:
-        # # Find the first difference for debugging
-        # cleaned_lines = cleaned_header.split(b"\n")
-        # expected_lines = expected_header.split(b"\n")
-        # for i, (c, e) in enumerate(zip(cleaned_lines, expected_lines)):
-        #     if c != e:
-        #         _LOGGER.debug("\nFirst difference at line %d:", i + 1)
-        #         _LOGGER.debug("Expected: '%s'", e.decode(errors="replace"))
-        #         _LOGGER.debug("Got:      '%s'", c.decode(errors="replace"))
-        #         break
-        return False, "License header does not match the required Apache License header text"
-
-    return True, None
-
-
 async def _record_artifact(recorder: checks.Recorder, result: ArtifactResult) -> None:
     match result.status:
         case models.CheckResultStatus.SUCCESS:
diff --git a/atr/tasks/checks/zipformat.py b/atr/tasks/checks/zipformat.py
index 542e301..f4b0351 100644
--- a/atr/tasks/checks/zipformat.py
+++ b/atr/tasks/checks/zipformat.py
@@ -279,12 +279,7 @@ def _license_headers_check_should_check_zip(member_path: str, extension: str) ->
     """Determine whether a file in a zip should be checked for license headers."""
     for pattern in license.INCLUDED_PATTERNS:
         if license.re.match(pattern, f".{extension}"):
-            # Also check whether we have a comment style defined for it
-            if license.COMMENT_STYLES.get(extension):
-                return True
-            else:
-                _LOGGER.warning(f"No comment style defined for included extension '{extension}' in {member_path}")
-                return False
+            return True
     return False
 
 
@@ -295,14 +290,8 @@ def _license_headers_check_single_file_zip(
     member_path = member_info.filename
     try:
         with zf.open(member_path) as file_in_zip:
-            content_bytes = file_in_zip.read(2048)
-            header_bytes = license.strip_comments(content_bytes, extension)
-            expected_header_bytes = license.APACHE_LICENSE_HEADER
-            if header_bytes == expected_header_bytes:
-                return True, None
-            else:
-                # Header mismatch
-                return False, None
+            content_bytes = file_in_zip.read(4096)
+            return license.headers_validate(content_bytes, member_path)
     except Exception as read_error:
         return False, f"{member_path} (Read Error: {read_error})"
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(tooling-trusted-release) branch main updated: Make license file and header checks much more lenient

Reply via email to