(tinkerpop) branch master updated: Added license header fix utility CTR

spmallette Sat, 14 Mar 2026 11:59:58 -0700

This is an automated email from the ASF dual-hosted git repository.

spmallette pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tinkerpop.git



The following commit(s) were added to refs/heads/master by this push:
     new fd2c4b4a1d Added license header fix utility CTR
fd2c4b4a1d is described below

commit fd2c4b4a1dc989ff1f90000f6117250e3785a6a1
Author: Stephen Mallette <[email protected]>
AuthorDate: Sat Mar 14 14:59:30 2026 -0400

    Added license header fix utility CTR
---
 bin/fix-license-headers.py | 526 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 526 insertions(+)

diff --git a/bin/fix-license-headers.py b/bin/fix-license-headers.py
new file mode 100755
index 0000000000..fc3ef04266
--- /dev/null
+++ b/bin/fix-license-headers.py
@@ -0,0 +1,526 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+"""
+Validates and fixes ASF license headers across the TinkerPop repository.
+
+Handles all comment styles found in the repo:
+  - Java/Go/C# block comments  (* prefix inside /* ... */)
+  - Double-slash                (// prefix)
+  - Hash                        (# prefix)
+  - AsciiDoc block comment      (content inside //// ... ////)
+  - HTML/XML block comment      (content inside <!-- ... -->)
+  - Batch files                 (:: prefix)
+  - RST files                   (.. prefix)
+
+Respects the rat-plugin exclusion list from the root pom.xml.
+
+Usage:
+    python3 bin/fix-license-headers.py           # report issues only
+    python3 bin/fix-license-headers.py --fix     # report and fix issues
+    python3 bin/fix-license-headers.py --verbose # show per-file details
+"""
+
+import os
+import re
+import sys
+import fnmatch
+from collections import Counter
+
+# Repository root: the parent of the directory that contains this script
+# (the script is committed as bin/fix-license-headers.py).
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# ---------------------------------------------------------------------------
+# Canonical license text
+# ---------------------------------------------------------------------------
+
+# Lines of the license body with no comment prefix.  Empty strings stand for
+# the blank separator lines; the URL line keeps its two-space indent.
+CANONICAL_LINES = [
+    "Licensed to the Apache Software Foundation (ASF) under one",
+    "or more contributor license agreements.  See the NOTICE file",
+    "distributed with this work for additional information",
+    "regarding copyright ownership.  The ASF licenses this file",
+    "to you under the Apache License, Version 2.0 (the",
+    '"License"); you may not use this file except in compliance',
+    "with the License.  You may obtain a copy of the License at",
+    "",
+    "  http://www.apache.org/licenses/LICENSE-2.0",
+    "",
+    "Unless required by applicable law or agreed to in writing,",
+    "software distributed under the License is distributed on an",
+    '"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY',
+    "KIND, either express or implied.  See the License for the",
+    "specific language governing permissions and limitations",
+    "under the License.",
+]
+
+# Full AsciiDoc comment block, including the opening and closing ////
+# delimiter lines.  Compared verbatim (after stripping line endings) against
+# the block found in each .asciidoc file.
+CANONICAL_ASCIIDOC = """\
+////
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+////"""
+
+# Full HTML/XML comment block, including the <!-- and --> delimiter lines.
+# Compared verbatim (after stripping line endings) against the comment block
+# that surrounds the license text.
+CANONICAL_HTML = """\
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->"""
+
+# Phrases that mark the end of any Apache license block (stripped of prefix).
+# The standard form ends with "under the License." on its own line; the
+# paragraph-wrapped form ends with "limitations under the License."
+# A line only counts as the end when its prefix-stripped content equals one
+# of these phrases exactly.
+LICENSE_END_PHRASES = ("under the License.", "limitations under the License.")
+
+# ---------------------------------------------------------------------------
+# Exclusion patterns (mirrors rat-plugin <excludes> in root pom.xml)
+# ---------------------------------------------------------------------------
+
+# Paths are matched relative to the repository root using '/' separators.
+EXCLUDE_PATTERNS = [
+    ".mailmap", ".asf.yaml", ".travis.yml", ".travis.*.sh", ".dockerignore",
+    ".github/**",
+    "**/.classpath", "**/.project", "**/.settings/**", "**/.idea/**",
+    ".repository/**", "**/target/**",
+    "data/*.txt",
+    "**/bin/gremlin.sh", "gremlin-console/bin/gremlin.sh",
+    "docs/static/**", "docs/original/**", "docs/site/home/css/**", "docs/site/home/js/**",
+    "docs/gremlint/build/**", "docs/gremlint/public/CNAME",
+    "**/AGENTS.md",
+    "**/*.kryo", "**/*.gbin", "**/*.iml", "**/*.json", "**/*.xml",
+    "**/*.ldjson", "**/*.graffle", "**/*.svg", "**/*.trx", "**/*.sln",
+    "**/*.user", "**/*.csproj", "**/*.nuspec",
+    "**/goal.txt",
+    "**/src/main/resources/META-INF/services/**",
+    "**/src/test/resources/mockito-extensions/**",
+    "**/src/test/resources/META-INF/services/**",
+    "**/src/test/resources/cucumber.properties",
+    "**/src/test/resources/incorrect-traversals.txt",
+    "**/src/test/resources/org/apache/tinkerpop/gremlin/console/groovy/plugin/script-customizer-*.groovy",
+    "**/src/test/resources/org/apache/tinkerpop/gremlin/jsr223/script-customizer-*.groovy",
+    "**/src/test/resources/org/apache/tinkerpop/gremlin/console/jsr223/script-customizer-*.groovy",
+    "**/src/main/resources/org/apache/tinkerpop/gremlin/structure/io/script/*.txt",
+    "**/src/main/ext/**", "**/src/main/static/**",
+    "**/_bsp/**",
+    "DEPENDENCIES", "**/.glv",
+    "**/Debug/**", "**/Release/**", "**/obj/**",
+    "**/.vs/**", "**/NuGet.Config", "**/BenchmarkDotNet.Artifacts/**",
+    "**/.nvmrc", "**/.yarnrc.yml", "**/yarn.lock",
+    "**/node/**", "**/node_modules/**", "**/npm-debug.log",
+    "**/build/**", "**/doc/**", "**/lib/**",
+    "**/.env", "**/.prettierrc", "**/_site/**",
+    "**/.pytest_cache/**", "**/venv/**", "**/.venv/**", "**/.eggs/**",
+    "**/gremlinpython.egg-info/**", "**/docfx/**",
+    "**/go.sum", "**/coverage.out", "**/gremlinconsoletest.egg-info/**",
+]
+
+# Directories never descended into (faster than pattern matching every file).
+# Matched by bare directory name at any depth of the walk.
+PRUNE_DIRS = {'.git', 'target', 'node_modules', 'build', 'venv', '.venv',
+              '.eggs', 'doc', 'lib', 'Debug', 'Release', 'obj', 'docfx',
+              '_site', '__pycache__', '.pytest_cache',
+              'BenchmarkDotNet.Artifacts'}
+
+# ---------------------------------------------------------------------------
+# Pattern matching
+# ---------------------------------------------------------------------------
+
+def matches_exclude_pattern(rel_path, patterns):
+    """
+    Return True if rel_path matches at least one exclusion pattern.
+
+    rel_path is a path relative to the repository root; OS separators are
+    normalised to '/' before matching.  Three pattern shapes are handled:
+
+      - no '/' in the pattern: matched against the file's basename only
+      - contains '**': matched against the whole path and, when the pattern
+        starts with '**/', also against every trailing sub-path so that the
+        remainder can match at any depth (including the top level)
+      - any other pattern: matched against the whole path or the basename
+
+    Matching uses fnmatch, where '*' also matches '/' characters, so the
+    patterns are somewhat more permissive than true Ant-style globs.
+    """
+    rel_path = rel_path.replace(os.sep, "/")
+    basename = os.path.basename(rel_path)
+    parts = rel_path.split("/")
+    # Every trailing sub-path ("a/b/c", "b/c", "c"); computed once per path
+    # instead of once per '**/' pattern.
+    suffixes = ["/".join(parts[i:]) for i in range(len(parts))]
+
+    for pattern in patterns:
+        pattern = pattern.replace(os.sep, "/")
+        if "/" not in pattern:
+            if fnmatch.fnmatch(basename, pattern):
+                return True
+        elif "**" in pattern:
+            if pattern.startswith("**/"):
+                inner = pattern[3:]
+                if any(fnmatch.fnmatch(suffix, inner) for suffix in suffixes):
+                    return True
+            if fnmatch.fnmatch(rel_path, pattern):
+                return True
+        elif fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(basename, pattern):
+            return True
+    return False
+
+# ---------------------------------------------------------------------------
+# AsciiDoc handling  (//// ... ////)
+# ---------------------------------------------------------------------------
+
+def process_asciidoc(filepath, fix):
+    """
+    Check (and optionally fix) the license comment block of an AsciiDoc file.
+
+    The license is expected inside a comment block delimited by lines that
+    consist solely of '////'.  Delimiter lines are paired in file order and
+    the pair that encloses the license text is the one that is validated,
+    so an unrelated comment block earlier in the file is never touched.
+
+    Returns a (status, issues) tuple where status is one of 'no_license',
+    'unparseable', 'ok', 'fixed', 'has_issues' or 'error'.
+    """
+    marker = 'Licensed to the Apache Software Foundation'
+
+    try:
+        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+            content = f.read()
+    except OSError as e:
+        return 'error', [str(e)]
+
+    if marker not in content:
+        return 'no_license', []
+
+    lines = content.splitlines(keepends=True)
+    licensed_idx = next((i for i, l in enumerate(lines) if marker in l), None)
+    if licensed_idx is None:
+        return 'no_license', []
+
+    delimiters = [i for i, l in enumerate(lines) if l.rstrip('\r\n') == '////']
+    if not delimiters:
+        return 'unparseable', ['no opening //// delimiter found']
+
+    # Pair delimiters as (open, close) in file order and pick the block that
+    # actually contains the license text.
+    first = second = None
+    for open_idx, close_idx in zip(delimiters[0::2], delimiters[1::2]):
+        if open_idx < licensed_idx < close_idx:
+            first, second = open_idx, close_idx
+            break
+
+    if first is None:
+        # An odd delimiter count leaves the last block unterminated.
+        if len(delimiters) % 2 == 1 and delimiters[-1] < licensed_idx:
+            return 'unparseable', ['no closing //// delimiter found']
+        return 'unparseable', ['license text is not inside a //// comment block']
+
+    body = '\n'.join(l.rstrip('\r\n') for l in lines[first:second + 1])
+    if body == CANONICAL_ASCIIDOC:
+        return 'ok', []
+
+    if fix:
+        # Keep whatever precedes the block; only the block itself is replaced.
+        new_content = (
+            ''.join(lines[:first])
+            + CANONICAL_ASCIIDOC + '\n'
+            + ''.join(lines[second + 1:])
+        )
+        try:
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(new_content)
+        except OSError as e:
+            return 'error', [f"fix failed: {e}"]
+        return 'fixed', ['license block replaced with canonical form']
+
+    return 'has_issues', ['license block does not match canonical form']
+
+# ---------------------------------------------------------------------------
+# HTML/XML comment handling  (<!-- ... -->)
+# ---------------------------------------------------------------------------
+
+def process_html_comment(filepath, fix, lines, open_idx):
+    """
+    Handle a <!-- --> comment block starting at open_idx.
+
+    lines is the file content split with line endings kept, and open_idx is
+    the index of the line that consists solely of '<!--'.  The block runs to
+    the first following line that consists solely of '-->'.
+
+    Returns a (status, issues) tuple where status is one of 'unparseable',
+    'ok', 'fixed', 'has_issues' or 'error'.
+    """
+    close_idx = next(
+        (i for i in range(open_idx + 1, len(lines)) if lines[i].rstrip('\r\n') == '-->'),
+        None
+    )
+    if close_idx is None:
+        return 'unparseable', ['no closing --> delimiter found']
+
+    body = '\n'.join(l.rstrip('\r\n') for l in lines[open_idx:close_idx + 1])
+    if body == CANONICAL_HTML:
+        return 'ok', []
+
+    if fix:
+        # Content before and after the comment block is preserved untouched.
+        new_content = (
+            ''.join(lines[:open_idx])
+            + CANONICAL_HTML + '\n'
+            + ''.join(lines[close_idx + 1:])
+        )
+        try:
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(new_content)
+        except OSError as e:
+            # Report the failure instead of aborting the whole scan.
+            return 'error', [f"fix failed: {e}"]
+        return 'fixed', ['html/xml comment block replaced with canonical form']
+
+    return 'has_issues', ['html/xml comment block does not match canonical form']
+
+# ---------------------------------------------------------------------------
+# Generic comment-style handling
+# ---------------------------------------------------------------------------
+
+def detect_comment_style(lines, start_idx):
+    """
+    Detect the comment style from the line at start_idx.
+
+    Returns (style, base_prefix).  Styles, probed in this order:
+      block_star   — ' * ' prefix  (Java/JS/C# block comments)
+      double_slash — '// ' prefix
+      double_colon — ':: ' prefix  (batch files)
+      double_dot   — '.. ' prefix  (RST)
+      hash         — '# ' prefix
+      space_indent — leading whitespace before a line starting 'Licensed'
+      plain        — no prefix
+
+    base_prefix is the matched prefix including its trailing whitespace
+    (an empty string for 'plain').  For 'hash' it is the most common prefix
+    among the start line and the lines that follow it, so one oddly spaced
+    line does not decide the prefix for the whole block.
+    """
+    line = lines[start_idx].rstrip('\n').rstrip('\r')
+
+    # Optional indent, '*', then at least one whitespace character.
+    m = re.match(r'^(\s*\*\s+)', line)
+    if m:
+        return 'block_star', m.group(1)
+
+    m = re.match(r'^(//\s*)', line)
+    if m:
+        return 'double_slash', m.group(1)
+
+    m = re.match(r'^(::\s*)', line)
+    if m:
+        return 'double_colon', m.group(1)
+
+    m = re.match(r'^(\.\.\s+)', line)
+    if m:
+        return 'double_dot', m.group(1)
+
+    m = re.match(r'^(#\s*)', line)
+    if m:
+        first_prefix = m.group(1)
+        prefixes_seen = [first_prefix]
+        # Sample up to nine following lines and vote on the prefix.
+        for j in range(start_idx + 1, min(start_idx + 10, len(lines))):
+            next_line = lines[j].rstrip('\n').rstrip('\r')
+            # A bare '#' is a blank comment line and carries no spacing info.
+            if next_line.rstrip() == '#':
+                continue
+            m2 = re.match(r'^(#\s*)', next_line)
+            if m2 and next_line.strip():
+                prefixes_seen.append(m2.group(1))
+        prefix_counter = Counter(prefixes_seen)
+        return 'hash', prefix_counter.most_common(1)[0][0]
+
+    # Indented license text with no comment character at all.
+    m = re.match(r'^(\s+)', line)
+    if m and line.strip().startswith('Licensed'):
+        return 'space_indent', m.group(1)
+
+    return 'plain', ''
+
+
+def get_line_content(line, style, base_prefix):
+    """
+    Strip the comment prefix and return the bare content of a line.
+
+    line is a raw file line (line ending allowed), style and base_prefix are
+    the values returned by detect_comment_style.  Trailing whitespace is
+    removed from the result.  A line shorter than base_prefix (for example a
+    bare ' *', '#' or '//') yields an empty string so that it compares equal
+    to a blank canonical line.
+    """
+    raw = line.rstrip('\n').rstrip('\r')
+
+    if style == 'block_star':
+        m = re.match(r'^(\s*\*)(.*)', raw)
+        if m:
+            full = m.group(1) + m.group(2)
+            return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else ""
+        # No leading '*': return the line as-is, minus trailing whitespace.
+        return raw.rstrip()
+
+    if style in ('double_slash', 'hash'):
+        char = '//' if style == 'double_slash' else '#'
+        m = re.match(r'^(' + re.escape(char) + r')(.*)', raw)
+        if m:
+            full = char + m.group(2)
+            return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else ""
+        return raw.strip()
+
+    if style == 'double_colon':
+        if raw.rstrip() == '::':
+            return ""
+        m = re.match(r'^(::\s*)(.*)', raw)
+        if m:
+            full = '::' + raw[2:]
+            return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else m.group(2).rstrip()
+        return raw.strip()
+
+    if style == 'double_dot':
+        if raw.rstrip() == '..':
+            return ""
+        if not raw.strip():
+            return ""
+        # The first two characters are treated as the '..' marker even when
+        # the line does not start with it.
+        full = '..' + raw[2:]
+        return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else raw.strip()
+
+    if style == 'space_indent':
+        if not raw.strip():
+            return ""
+        return raw[len(base_prefix):].rstrip() if len(raw) >= len(base_prefix) else raw.rstrip()
+
+    return raw.rstrip()  # plain
+
+
+def find_license_block(lines, style, base_prefix):
+    """
+    Locate start and end line indices of the license block.
+
+    The block starts at the first line containing 'Licensed to the Apache
+    Software Foundation' and ends at the first line, within 55 lines of the
+    start, whose prefix-stripped content is one of LICENSE_END_PHRASES.
+
+    Accepts both the standard ending ('under the License.') and the
+    paragraph-wrapped ending ('limitations under the License.').
+
+    Returns (start_idx, end_idx), or None when no start line or end line is
+    found, or when the block is not between 14 and 22 lines long.
+    """
+    start_idx = next(
+        (i for i, l in enumerate(lines) if 'Licensed to the Apache Software Foundation' in l),
+        None
+    )
+    if start_idx is None:
+        return None
+
+    end_idx = None
+    for i in range(start_idx, min(start_idx + 55, len(lines))):
+        content = get_line_content(lines[i], style, base_prefix).rstrip()
+        if content in LICENSE_END_PHRASES:
+            end_idx = i
+            break
+
+    if end_idx is None:
+        return None
+
+    # Reject blocks whose length is implausible for the license text.
+    num_lines = end_idx - start_idx + 1
+    if not (14 <= num_lines <= 22):
+        return None
+
+    return start_idx, end_idx
+
+
+def reconstruct_license_lines(style, base_prefix):
+    """
+    Build the corrected license block lines (with newlines).
+
+    Each non-empty canonical line is emitted as base_prefix + text (no
+    prefix for the 'plain' style).  Each empty canonical line is emitted as
+    the bare comment marker for the style, without trailing whitespace:
+    the indent-plus-'*' part of base_prefix for 'block_star', '//', '#',
+    '::' or '..' for the prefix styles, and nothing for any other style.
+    """
+    if style == 'block_star':
+        star = re.match(r'^(\s*\*)', base_prefix)
+        blank_marker = star.group(1) if star else ""
+    else:
+        blank_marker = {
+            'double_slash': "//",
+            'hash': "#",
+            'double_colon': "::",
+            'double_dot': "..",
+        }.get(style, "")
+
+    text_prefix = base_prefix if style != 'plain' else ""
+
+    return [
+        (text_prefix + canonical if canonical != "" else blank_marker) + "\n"
+        for canonical in CANONICAL_LINES
+    ]
+
+
+def process_generic(filepath, fix):
+    """
+    Check (and optionally fix) the license header of a non-AsciiDoc file.
+
+    The file is read as UTF-8 with undecodable bytes replaced.  When the
+    line before the license text is exactly '<!--' the work is delegated to
+    process_html_comment; otherwise the comment style is detected from the
+    first license line and the block is compared line by line against
+    CANONICAL_LINES.
+
+    Returns a (status, issues) tuple where status is one of 'error',
+    'no_license', 'unparseable', 'ok', 'fixed' or 'has_issues'.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+            content = f.read()
+    except Exception as e:
+        return 'error', [str(e)]
+
+    if 'Licensed to the Apache Software Foundation' not in content:
+        return 'no_license', []
+
+    lines = content.splitlines(keepends=True)
+
+    licensed_idx = next(
+        (i for i, l in enumerate(lines) if 'Licensed to the Apache Software Foundation' in l),
+        None
+    )
+    if licensed_idx is None:
+        return 'no_license', []
+
+    # Check for HTML/XML comment block (<!-- on the line before Licensed)
+    if licensed_idx > 0 and lines[licensed_idx - 1].rstrip('\r\n') == '<!--':
+        return process_html_comment(filepath, fix, lines, licensed_idx - 1)
+
+    style, base_prefix = detect_comment_style(lines, licensed_idx)
+    result = find_license_block(lines, style, base_prefix)
+    if result is None:
+        return 'unparseable', ['could not locate complete license block']
+
+    start_idx, end_idx = result
+    extracted = [get_line_content(lines[i], style, base_prefix) for i in range(start_idx, end_idx + 1)]
+
+    mismatches = []
+    if len(extracted) != len(CANONICAL_LINES):
+        mismatches.append(f"line count: got {len(extracted)}, expected {len(CANONICAL_LINES)}")
+    # Compare pairwise over the common length; a length difference has
+    # already been reported above.
+    for i in range(min(len(extracted), len(CANONICAL_LINES))):
+        if extracted[i] != CANONICAL_LINES[i]:
+            mismatches.append(f"line {i}: got {repr(extracted[i])}, expected {repr(CANONICAL_LINES[i])}")
+
+    if not mismatches:
+        return 'ok', []
+
+    if fix:
+        try:
+            # Only the license block is regenerated; surrounding lines are
+            # written back unchanged.
+            new_lines = (
+                lines[:start_idx]
+                + reconstruct_license_lines(style, base_prefix)
+                + lines[end_idx + 1:]
+            )
+            with open(filepath, 'w', encoding='utf-8', errors='replace') as f:
+                f.writelines(new_lines)
+            return 'fixed', mismatches
+        except Exception as e:
+            return 'error', [f"fix failed: {e}"]
+
+    return 'has_issues', mismatches
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """
+    Walk the repository, check every non-excluded file and print a summary.
+
+    Recognised command-line flags: --fix (rewrite non-canonical headers) and
+    --verbose (print a line for every file that is OK or was fixed).
+
+    Returns the process exit status: 1 when at least one file still has
+    license issues, 0 otherwise.  A plain 0/1 is used rather than the number
+    of failing files because exit statuses are truncated to a small range by
+    the operating system, so a large count could wrap around to 0.
+    """
+    fix_mode = '--fix' in sys.argv
+    verbose_mode = '--verbose' in sys.argv
+
+    print(f"Repository: {REPO_ROOT}")
+    print(f"Mode:       {'fix' if fix_mode else 'check only'}")
+    print()
+
+    stats = Counter()
+    problems = {}    # rel_path -> list of issue strings
+    unparseable = []
+    errors = []      # (rel_path, issues) for files that could not be processed
+
+    for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
+        # In-place assignment prunes the walk; sorting keeps output stable.
+        dirnames[:] = sorted(d for d in dirnames if d not in PRUNE_DIRS)
+
+        for filename in sorted(filenames):
+            filepath = os.path.join(dirpath, filename)
+            rel_path = os.path.relpath(filepath, REPO_ROOT).replace(os.sep, '/')
+
+            if matches_exclude_pattern(rel_path, EXCLUDE_PATTERNS):
+                continue
+
+            if filename.endswith('.asciidoc'):
+                status, issues = process_asciidoc(filepath, fix_mode)
+            else:
+                status, issues = process_generic(filepath, fix_mode)
+
+            stats[status] += 1
+
+            if status == 'has_issues':
+                problems[rel_path] = issues
+            elif status == 'unparseable':
+                unparseable.append((rel_path, issues))
+            elif status == 'error':
+                errors.append((rel_path, issues))
+            elif status in ('fixed', 'ok') and verbose_mode:
+                print(f"{'FIXED' if status == 'fixed' else 'OK   '} {rel_path}")
+
+    total = sum(stats.values())
+    print(f"Files scanned:    {total}")
+    print(f"  No license:     {stats['no_license']}")
+    print(f"  OK:             {stats['ok']}")
+    print(f"  Fixed:          {stats['fixed']}")
+    print(f"  Has issues:     {stats['has_issues']}")
+    print(f"  Unparseable:    {stats['unparseable']}")
+    print(f"  Errors:         {stats['error']}")
+
+    if problems:
+        print(f"\n=== FILES WITH LICENSE ISSUES ({len(problems)}) ===")
+        for rel_path, issues in sorted(problems.items()):
+            print(f"\n  {rel_path}:")
+            # Cap the detail so one badly mangled header does not flood output.
+            for issue in issues[:5]:
+                print(f"    {issue}")
+
+    if unparseable:
+        print(f"\n=== UNPARSEABLE LICENSE BLOCKS ({len(unparseable)}) ===")
+        for rel_path, issues in unparseable:
+            print(f"  {rel_path}: {issues[0]}")
+
+    if errors:
+        print(f"\n=== FILES THAT COULD NOT BE PROCESSED ({len(errors)}) ===")
+        for rel_path, issues in errors:
+            print(f"  {rel_path}: {issues[0] if issues else 'unknown error'}")
+
+    return 1 if problems else 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())

(tinkerpop) branch master updated: Added license header fix utility CTR

Reply via email to