This is an automated email from the ASF dual-hosted git repository.
spmallette pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tinkerpop.git
The following commit(s) were added to refs/heads/master by this push:
new fd2c4b4a1d Added license header fix utility CTR
fd2c4b4a1d is described below
commit fd2c4b4a1dc989ff1f90000f6117250e3785a6a1
Author: Stephen Mallette <[email protected]>
AuthorDate: Sat Mar 14 14:59:30 2026 -0400
Added license header fix utility CTR
---
bin/fix-license-headers.py | 526 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 526 insertions(+)
diff --git a/bin/fix-license-headers.py b/bin/fix-license-headers.py
new file mode 100755
index 0000000000..fc3ef04266
--- /dev/null
+++ b/bin/fix-license-headers.py
@@ -0,0 +1,526 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+"""
+Validates and fixes ASF license headers across the TinkerPop repository.
+
+Handles all comment styles found in the repo:
+ - Java/Go/C# block comments (* prefix inside /* ... */)
+ - Double-slash (// prefix)
+ - Hash (# prefix)
+ - AsciiDoc block comment (content inside //// ... ////)
+ - HTML/XML block comment (content inside <!-- ... -->)
+ - Batch files (:: prefix)
+ - RST files (.. prefix)
+
+Respects the rat-plugin exclusion list from the root pom.xml.
+
+Usage:
+ python3 bin/fix-license-headers.py # report issues only
+ python3 bin/fix-license-headers.py --fix # report and fix issues
+ python3 bin/fix-license-headers.py --verbose # show per-file details
+"""
+
+import os
+import re
+import sys
+import fnmatch
+from collections import Counter
+
# Absolute path two directory levels above this script's own location
# (the script is committed as bin/fix-license-headers.py, so this is the
# directory that contains bin/).  Every scanned path is made relative to it.
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# ---------------------------------------------------------------------------
+# Canonical license text
+# ---------------------------------------------------------------------------
+
# Lines of the license body with no comment prefix.  This list is the single
# source of truth: the delimited AsciiDoc and HTML/XML forms below are built
# from it, so the three representations cannot drift apart.
# NOTE(review): the header text published by the ASF uses two spaces after
# sentence-ending periods and indents the URL line; confirm the spacing of
# these literals against the checked-in file before relying on them.
CANONICAL_LINES = [
    "Licensed to the Apache Software Foundation (ASF) under one",
    "or more contributor license agreements. See the NOTICE file",
    "distributed with this work for additional information",
    "regarding copyright ownership. The ASF licenses this file",
    "to you under the Apache License, Version 2.0 (the",
    '"License"); you may not use this file except in compliance',
    "with the License. You may obtain a copy of the License at",
    "",
    " http://www.apache.org/licenses/LICENSE-2.0",
    "",
    "Unless required by applicable law or agreed to in writing,",
    "software distributed under the License is distributed on an",
    '"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY',
    "KIND, either express or implied. See the License for the",
    "specific language governing permissions and limitations",
    "under the License.",
]

# The bare license body as one newline-joined string (no trailing newline).
_CANONICAL_BODY = "\n".join(CANONICAL_LINES)

# Full AsciiDoc block (content between //// delimiters, inclusive).
CANONICAL_ASCIIDOC = "////\n" + _CANONICAL_BODY + "\n////"

# Full HTML/XML comment block (content between <!-- and -->, inclusive).
CANONICAL_HTML = "<!--\n" + _CANONICAL_BODY + "\n-->"
+
# Phrases that mark the end of any Apache license block (stripped of prefix).
# The standard form ends with "under the License." on its own line; the
# paragraph-wrapped form ends with "limitations under the License."
# find_license_block tests a line's prefix-stripped content for membership in
# this tuple, i.e. the whole line must equal one of the phrases; a longer line
# that merely ends with one of them is not treated as the end of the block.
LICENSE_END_PHRASES = ("under the License.", "limitations under the License.")
+
+# ---------------------------------------------------------------------------
+# Exclusion patterns (mirrors rat-plugin <excludes> in root pom.xml)
+# ---------------------------------------------------------------------------
+
EXCLUDE_PATTERNS = [
    # Repository / CI metadata
    ".mailmap", ".asf.yaml", ".travis.yml", ".travis.*.sh", ".dockerignore",
    ".github/**",
    # IDE and build output
    "**/.classpath", "**/.project", "**/.settings/**", "**/.idea/**",
    ".repository/**", "**/target/**",
    # Sample data and launcher scripts
    "data/*.txt",
    "**/bin/gremlin.sh", "gremlin-console/bin/gremlin.sh",
    # Documentation assets
    "docs/static/**", "docs/original/**", "docs/site/home/css/**",
    "docs/site/home/js/**",
    "docs/gremlint/build/**", "docs/gremlint/public/CNAME",
    "**/AGENTS.md",
    # File types excluded wholesale
    "**/*.kryo", "**/*.gbin", "**/*.iml", "**/*.json", "**/*.xml",
    "**/*.ldjson", "**/*.graffle", "**/*.svg", "**/*.trx", "**/*.sln",
    "**/*.user", "**/*.csproj", "**/*.nuspec",
    "**/goal.txt",
    # Resource files
    "**/src/main/resources/META-INF/services/**",
    "**/src/test/resources/mockito-extensions/**",
    "**/src/test/resources/META-INF/services/**",
    "**/src/test/resources/cucumber.properties",
    "**/src/test/resources/incorrect-traversals.txt",
    "**/src/test/resources/org/apache/tinkerpop/gremlin/console/groovy/plugin/script-customizer-*.groovy",
    "**/src/test/resources/org/apache/tinkerpop/gremlin/jsr223/script-customizer-*.groovy",
    "**/src/test/resources/org/apache/tinkerpop/gremlin/console/jsr223/script-customizer-*.groovy",
    "**/src/main/resources/org/apache/tinkerpop/gremlin/structure/io/script/*.txt",
    "**/src/main/ext/**", "**/src/main/static/**",
    "**/_bsp/**",
    "DEPENDENCIES", "**/.glv",
    # .NET
    "**/Debug/**", "**/Release/**", "**/obj/**",
    "**/.vs/**", "**/NuGet.Config", "**/BenchmarkDotNet.Artifacts/**",
    # JavaScript
    "**/.nvmrc", "**/.yarnrc.yml", "**/yarn.lock",
    "**/node/**", "**/node_modules/**", "**/npm-debug.log",
    "**/build/**", "**/doc/**", "**/lib/**",
    "**/.env", "**/.prettierrc", "**/_site/**",
    # Python
    "**/.pytest_cache/**", "**/venv/**", "**/.venv/**", "**/.eggs/**",
    "**/gremlinpython.egg-info/**", "**/docfx/**",
    # Go and remaining generated artifacts
    "**/go.sum", "**/coverage.out", "**/gremlinconsoletest.egg-info/**",
]
+
# Directory names that the walk in main() removes from its traversal, so
# nothing beneath them is ever visited (cheaper than pattern-matching every
# file they contain).
PRUNE_DIRS = {
    # version control and interpreter caches
    '.git', '__pycache__', '.pytest_cache',
    # build output
    'target', 'build', 'Debug', 'Release', 'obj',
    'BenchmarkDotNet.Artifacts',
    # dependencies and virtual environments
    'node_modules', 'lib', 'venv', '.venv', '.eggs',
    # generated documentation
    'doc', 'docfx', '_site',
}
+
+# ---------------------------------------------------------------------------
+# Pattern matching
+# ---------------------------------------------------------------------------
+
def matches_exclude_pattern(rel_path, patterns):
    """Return True if ``rel_path`` matches any of the exclusion ``patterns``.

    ``rel_path`` is a repository-relative path; OS separators in both the
    path and the patterns are normalized to ``/`` before matching.

    Matching rules, applied per pattern:
      * no ``/`` in the pattern: matched against the file's base name only;
      * pattern starts with ``**/``: the remainder is matched against every
        trailing sub-path (``a/b/c``, ``b/c``, ``c``), then the whole pattern
        is matched against the full path;
      * other patterns containing ``**``: matched against the full path;
      * anything else: matched against the full path, then the base name.

    Matching uses ``fnmatch.fnmatchcase`` so the result is case-sensitive and
    identical on every platform (``fnmatch.fnmatch`` case-normalizes through
    ``os.path.normcase``, which makes the outcome depend on the host OS).
    Note that in fnmatch syntax ``*`` also matches ``/``.
    """
    rel_path = rel_path.replace(os.sep, "/")
    base_name = os.path.basename(rel_path)

    # Loop-invariant: every trailing sub-path of rel_path, computed once
    # instead of once per "**/" pattern.
    parts = rel_path.split("/")
    suffixes = ["/".join(parts[i:]) for i in range(len(parts))]

    match = fnmatch.fnmatchcase

    for pattern in patterns:
        pattern = pattern.replace(os.sep, "/")

        if "/" not in pattern:
            if match(base_name, pattern):
                return True
        elif "**" in pattern:
            if pattern.startswith("**/"):
                inner = pattern[3:]
                if any(match(suffix, inner) for suffix in suffixes):
                    return True
            if match(rel_path, pattern):
                return True
        elif match(rel_path, pattern) or match(base_name, pattern):
            return True

    return False
+
+# ---------------------------------------------------------------------------
+# AsciiDoc handling (//// ... ////)
+# ---------------------------------------------------------------------------
+
def process_asciidoc(filepath, fix):
    """Validate (and optionally fix) the license block of an AsciiDoc file.

    The license is expected inside a ``////`` ... ``////`` comment block.
    The block that encloses the first line containing the license phrase is
    compared with CANONICAL_ASCIIDOC.

    Args:
        filepath: path of the file to inspect.
        fix: when true, a non-canonical block is replaced in place.

    Returns:
        A ``(status, issues)`` tuple where status is one of ``'error'``,
        ``'no_license'``, ``'unparseable'``, ``'ok'``, ``'fixed'`` or
        ``'has_issues'`` and issues is a list of human-readable strings.

    When fixing, text before the opening delimiter is kept, a uniform
    line-ending convention is kept, and files that are not valid UTF-8 are
    reported but never rewritten.
    """
    phrase = 'Licensed to the Apache Software Foundation'

    try:
        with open(filepath, 'rb') as f:
            raw = f.read()
    except OSError as e:
        return 'error', [str(e)]

    # The phrase is pure ASCII, so it can be searched for before decoding.
    if phrase.encode('ascii') not in raw:
        return 'no_license', []

    try:
        text = raw.decode('utf-8')
        lossless = True
    except UnicodeDecodeError:
        # Good enough for checking, but writing this back would persist
        # U+FFFD replacement characters, so the file must not be rewritten.
        text = raw.decode('utf-8', errors='replace')
        lossless = False

    # Remember the newline convention if the file uses exactly one, so a
    # rewrite does not convert e.g. CRLF to LF.  None (mixed or no newlines)
    # falls back to the platform default on write.
    crlf = text.count('\r\n')
    newline_counts = (
        ('\r\n', crlf),
        ('\r', text.count('\r') - crlf),
        ('\n', text.count('\n') - crlf),
    )
    used = [nl for nl, count in newline_counts if count]
    newline = used[0] if len(used) == 1 else None

    normalized = text.replace('\r\n', '\n').replace('\r', '\n')
    lines = normalized.splitlines(keepends=True)

    licensed_idx = next((i for i, l in enumerate(lines) if phrase in l), None)
    if licensed_idx is None:
        return 'no_license', []

    delimiters = [i for i, l in enumerate(lines) if l.rstrip('\r\n') == '////']
    before = [i for i in delimiters if i < licensed_idx]
    if not before:
        return 'unparseable', ['no opening //// delimiter found']
    if len(before) % 2 == 0:
        # Delimiters toggle comment blocks; an even count before the license
        # line means every earlier block is already closed, so replacing one
        # of them would overwrite an unrelated comment.
        return 'unparseable', ['license text is not inside a //// comment block']

    first = before[-1]
    second = next((i for i in delimiters if i > licensed_idx), None)
    if second is None:
        return 'unparseable', ['no closing //// delimiter found']

    body = '\n'.join(l.rstrip('\r\n') for l in lines[first:second + 1])
    if body == CANONICAL_ASCIIDOC:
        return 'ok', []

    mismatch = 'license block does not match canonical form'
    if not fix:
        return 'has_issues', [mismatch]
    if not lossless:
        return 'has_issues', ['not fixed: file is not valid UTF-8', mismatch]

    new_content = (
        ''.join(lines[:first])  # keep whatever precedes the block
        + CANONICAL_ASCIIDOC + '\n'
        + ''.join(lines[second + 1:])
    )
    try:
        with open(filepath, 'w', encoding='utf-8', newline=newline) as f:
            f.write(new_content)
    except OSError as e:
        return 'error', [f"fix failed: {e}"]
    return 'fixed', ['license block replaced with canonical form']
+
+# ---------------------------------------------------------------------------
+# HTML/XML comment handling (<!-- ... -->)
+# ---------------------------------------------------------------------------
+
def process_html_comment(filepath, fix, lines, open_idx):
    """Validate (and optionally fix) a ``<!-- ... -->`` license comment.

    Args:
        filepath: path of the file the lines came from; rewritten when fixing.
        fix: when true, a non-canonical block is replaced in place.
        lines: the file's lines, each keeping its line ending.
        open_idx: index in ``lines`` of the ``<!--`` line opening the block.

    Returns:
        A ``(status, issues)`` tuple where status is one of
        ``'unparseable'``, ``'ok'``, ``'fixed'``, ``'has_issues'`` or
        ``'error'`` (the rewrite failed) and issues is a list of strings.

    The block runs from ``open_idx`` to the first later line that is exactly
    ``-->``.  When fixing, lines outside the block are written back unchanged
    and, if the file on disk uses a single newline convention, that
    convention is preserved.
    """
    close_idx = next(
        (i for i in range(open_idx + 1, len(lines))
         if lines[i].rstrip('\r\n') == '-->'),
        None
    )
    if close_idx is None:
        return 'unparseable', ['no closing --> delimiter found']

    body = '\n'.join(l.rstrip('\r\n') for l in lines[open_idx:close_idx + 1])
    if body == CANONICAL_HTML:
        return 'ok', []

    if not fix:
        return 'has_issues', ['html/xml comment block does not match canonical form']

    new_content = (
        ''.join(lines[:open_idx])
        + CANONICAL_HTML + '\n'
        + ''.join(lines[close_idx + 1:])
    )

    try:
        # Sniff the newline convention currently on disk so the rewrite does
        # not silently convert the whole file (e.g. CRLF -> LF).
        with open(filepath, 'rb') as f:
            raw = f.read()
        crlf = raw.count(b'\r\n')
        newline_counts = (
            ('\r\n', crlf),
            ('\r', raw.count(b'\r') - crlf),
            ('\n', raw.count(b'\n') - crlf),
        )
        used = [nl for nl, count in newline_counts if count]
        newline = used[0] if len(used) == 1 else None

        if newline is not None:
            # Normalize first so lines that still carry '\r\n' are not
            # expanded to '\r\r\n' by the newline translation on write.
            new_content = new_content.replace('\r\n', '\n').replace('\r', '\n')

        with open(filepath, 'w', encoding='utf-8', newline=newline) as f:
            f.write(new_content)
    except OSError as e:
        return 'error', [f"fix failed: {e}"]

    return 'fixed', ['html/xml comment block replaced with canonical form']
+
+# ---------------------------------------------------------------------------
+# Generic comment-style handling
+# ---------------------------------------------------------------------------
+
def detect_comment_style(lines, start_idx):
    """
    Work out which comment style the line at ``start_idx`` is written in.

    Returns a ``(style, base_prefix)`` pair. Styles, in the order tried:
      block_star   - optional whitespace, '*', whitespace (block comments)
      double_slash - '//' plus any following whitespace
      double_colon - '::' plus any following whitespace (batch files)
      double_dot   - '..' plus following whitespace (RST)
      hash         - '#' plus any following whitespace; the prefix returned
                     is the most frequent one among this line and the
                     non-bare '#' lines in the nine lines after it
      space_indent - leading whitespace directly before 'Licensed'
      plain        - none of the above; the prefix is ''
    """
    text = lines[start_idx].rstrip('\n').rstrip('\r')

    # Styles whose prefix is read straight off the inspected line.
    direct_styles = (
        ('block_star', r'^(\s*\*\s+)'),
        ('double_slash', r'^(//\s*)'),
        ('double_colon', r'^(::\s*)'),
        ('double_dot', r'^(\.\.\s+)'),
    )
    for style_name, regex in direct_styles:
        found = re.match(regex, text)
        if found:
            return style_name, found.group(1)

    hash_regex = r'^(#\s*)'
    found = re.match(hash_regex, text)
    if found:
        # Vote across the following lines so a one-off spacing slip on the
        # first line does not decide the prefix for the whole block.
        tally = Counter()
        tally[found.group(1)] += 1
        for j in range(start_idx + 1, min(start_idx + 10, len(lines))):
            candidate = lines[j].rstrip('\n').rstrip('\r')
            if candidate.rstrip() == '#':
                continue  # bare separator lines carry no spacing information
            later = re.match(hash_regex, candidate)
            if later:
                tally[later.group(1)] += 1
        return 'hash', tally.most_common(1)[0][0]

    indent = re.match(r'^(\s+)', text)
    if indent and text.strip().startswith('Licensed'):
        return 'space_indent', indent.group(1)

    return 'plain', ''
+
+
def get_line_content(line, style, base_prefix):
    """Return ``line`` with its line ending and comment prefix removed.

    The prefix is removed by dropping the first ``len(base_prefix)``
    characters, so a line whose own prefix differs from ``base_prefix``
    yields content that will not compare equal to the canonical text.
    """
    text = line.rstrip('\n').rstrip('\r')
    width = len(base_prefix)

    def past_prefix(candidate, too_short):
        # Drop the prefix columns, or fall back when the line is shorter
        # than the prefix itself.
        if len(candidate) >= width:
            return candidate[width:].rstrip()
        return too_short

    if style == 'block_star':
        hit = re.match(r'^(\s*\*)(.*)', text)
        if hit is None:
            return text.rstrip()
        return past_prefix(hit.group(0), "")

    if style in ('double_slash', 'hash'):
        marker = '#' if style == 'hash' else '//'
        hit = re.match(r'^(' + re.escape(marker) + r')(.*)', text)
        if hit is None:
            return text.strip()
        return past_prefix(hit.group(0), "")

    if style == 'double_colon':
        if text.rstrip() == '::':
            return ""
        hit = re.match(r'^(::\s*)(.*)', text)
        if hit is None:
            return text.strip()
        return past_prefix('::' + text[2:], hit.group(2).rstrip())

    if style == 'double_dot':
        if text.rstrip() == '..' or not text.strip():
            return ""
        # The first two columns are treated as the '..' marker whatever they
        # actually contain.
        return past_prefix('..' + text[2:], text.strip())

    if style == 'space_indent':
        if not text.strip():
            return ""
        return past_prefix(text, text.rstrip())

    # plain
    return text.rstrip()
+
+
def find_license_block(lines, style, base_prefix):
    """
    Find where the license block starts and ends.

    The block starts at the first line containing the phrase
    'Licensed to the Apache Software Foundation' and ends at the first line,
    within 55 lines of the start, whose prefix-stripped content is one of
    LICENSE_END_PHRASES.

    Returns ``(start_idx, end_idx)`` (both inclusive), or None when there is
    no start line, no end line, or the block is not 14 to 22 lines long.
    """
    start_idx = None
    for idx, text in enumerate(lines):
        if 'Licensed to the Apache Software Foundation' in text:
            start_idx = idx
            break
    if start_idx is None:
        return None

    scan_limit = min(start_idx + 55, len(lines))
    end_idx = next(
        (
            i for i in range(start_idx, scan_limit)
            if get_line_content(lines[i], style, base_prefix).rstrip()
            in LICENSE_END_PHRASES
        ),
        None,
    )
    if end_idx is None:
        return None

    span = end_idx - start_idx + 1
    if span < 14 or span > 22:
        return None

    return start_idx, end_idx
+
+
def reconstruct_license_lines(style, base_prefix):
    """Return the canonical license as newline-terminated lines in ``style``.

    Non-empty canonical lines get ``base_prefix`` in front (nothing for the
    'plain' style).  Empty canonical lines become the bare comment marker of
    the style, or an empty line for styles that have no marker.
    """
    if style == 'block_star':
        # Reuse the indentation and '*' of the prefix, without the trailing
        # whitespace.
        star = re.match(r'^(\s*\*)', base_prefix)
        blank_marker = star.group(1) if star else ""
    else:
        bare_markers = {
            'double_slash': "//",
            'hash': "#",
            'double_colon': "::",
            'double_dot': "..",
        }
        blank_marker = bare_markers.get(style, "")

    text_prefix = "" if style == 'plain' else base_prefix

    return [
        (blank_marker if entry == "" else text_prefix + entry) + "\n"
        for entry in CANONICAL_LINES
    ]
+
+
def process_generic(filepath, fix):
    """Validate (and optionally fix) the license header of a non-AsciiDoc file.

    Args:
        filepath: path of the file to inspect.
        fix: when true, a non-canonical license block is rewritten in place.

    Returns:
        A ``(status, issues)`` tuple where status is one of ``'error'``,
        ``'no_license'``, ``'unparseable'``, ``'ok'``, ``'fixed'`` or
        ``'has_issues'`` and issues is a list of human-readable strings.

    A license that sits directly under a ``<!--`` line is handed to
    process_html_comment; everything else is handled by detecting the
    comment prefix and comparing line by line with CANONICAL_LINES.

    When fixing, a file that uses a single newline convention keeps it, and
    a file that is not valid UTF-8 is reported but never rewritten (decoding
    it is lossy, so writing it back would corrupt it).
    """
    phrase = 'Licensed to the Apache Software Foundation'
    not_fixed = 'not fixed: file is not valid UTF-8'

    try:
        with open(filepath, 'rb') as f:
            raw = f.read()
    except OSError as e:
        return 'error', [str(e)]

    # The phrase is pure ASCII, so files without it (including binaries)
    # are dismissed before any decoding is attempted.
    if phrase.encode('ascii') not in raw:
        return 'no_license', []

    try:
        text = raw.decode('utf-8')
        lossless = True
    except UnicodeDecodeError:
        text = raw.decode('utf-8', errors='replace')
        lossless = False

    # Remember the newline convention if the file uses exactly one; None
    # (mixed or no newlines) falls back to the platform default on write.
    crlf = text.count('\r\n')
    newline_counts = (
        ('\r\n', crlf),
        ('\r', text.count('\r') - crlf),
        ('\n', text.count('\n') - crlf),
    )
    used = [nl for nl, count in newline_counts if count]
    newline = used[0] if len(used) == 1 else None

    content = text.replace('\r\n', '\n').replace('\r', '\n')
    lines = content.splitlines(keepends=True)

    licensed_idx = next((i for i, l in enumerate(lines) if phrase in l), None)
    if licensed_idx is None:
        return 'no_license', []

    # Check for HTML/XML comment block (<!-- on the line before Licensed)
    if licensed_idx > 0 and lines[licensed_idx - 1].rstrip('\r\n') == '<!--':
        status, issues = process_html_comment(
            filepath, fix and lossless, lines, licensed_idx - 1)
        if fix and not lossless and status == 'has_issues':
            issues = [not_fixed] + issues
        return status, issues

    style, base_prefix = detect_comment_style(lines, licensed_idx)
    result = find_license_block(lines, style, base_prefix)
    if result is None:
        return 'unparseable', ['could not locate complete license block']

    start_idx, end_idx = result
    extracted = [
        get_line_content(lines[i], style, base_prefix)
        for i in range(start_idx, end_idx + 1)
    ]

    mismatches = []
    if len(extracted) != len(CANONICAL_LINES):
        mismatches.append(
            f"line count: got {len(extracted)}, expected {len(CANONICAL_LINES)}")
    for i, (got, expected) in enumerate(zip(extracted, CANONICAL_LINES)):
        if got != expected:
            mismatches.append(
                f"line {i}: got {repr(got)}, expected {repr(expected)}")

    if not mismatches:
        return 'ok', []

    if not fix:
        return 'has_issues', mismatches
    if not lossless:
        # Listed first so it stays visible when callers truncate the list.
        return 'has_issues', [not_fixed] + mismatches

    new_lines = (
        lines[:start_idx]
        + reconstruct_license_lines(style, base_prefix)
        + lines[end_idx + 1:]
    )
    try:
        with open(filepath, 'w', encoding='utf-8', newline=newline) as f:
            f.writelines(new_lines)
    except (OSError, UnicodeError) as e:
        return 'error', [f"fix failed: {e}"]
    return 'fixed', mismatches
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
def main():
    """Scan the repository, report license-header problems, optionally fix.

    Command-line flags (read from sys.argv):
      --fix      rewrite non-canonical license blocks in place
      --verbose  also print every file that was OK or fixed

    Returns:
        The number of files left with license issues, capped at 255 so the
        value is safe to use as a process exit status (an unclamped count of
        exactly 256 would wrap to 0 and report success).
    """
    fix_mode = '--fix' in sys.argv
    verbose_mode = '--verbose' in sys.argv

    print(f"Repository: {REPO_ROOT}")
    print(f"Mode: {'fix' if fix_mode else 'check only'}")
    print()

    stats = Counter()
    problems = {}     # rel_path -> list of issue strings
    unparseable = []  # (rel_path, issues)
    errors = []       # (rel_path, issues)

    for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
        # Prune in place so os.walk never descends into these directories.
        dirnames[:] = sorted(d for d in dirnames if d not in PRUNE_DIRS)

        # Sorted so output order does not depend on directory listing order.
        for filename in sorted(filenames):
            filepath = os.path.join(dirpath, filename)
            rel_path = os.path.relpath(filepath, REPO_ROOT).replace(os.sep, '/')

            if matches_exclude_pattern(rel_path, EXCLUDE_PATTERNS):
                continue

            if filename.endswith('.asciidoc'):
                status, issues = process_asciidoc(filepath, fix_mode)
            else:
                status, issues = process_generic(filepath, fix_mode)

            stats[status] += 1

            if status == 'has_issues':
                problems[rel_path] = issues
            elif status == 'unparseable':
                unparseable.append((rel_path, issues))
            elif status == 'error':
                errors.append((rel_path, issues))
            elif status in ('fixed', 'ok') and verbose_mode:
                print(f"{'FIXED' if status == 'fixed' else 'OK '} {rel_path}")

    total = sum(stats.values())
    print(f"Files scanned: {total}")
    print(f" No license: {stats['no_license']}")
    print(f" OK: {stats['ok']}")
    print(f" Fixed: {stats['fixed']}")
    print(f" Has issues: {stats['has_issues']}")
    print(f" Unparseable: {stats['unparseable']}")
    print(f" Errors: {stats['error']}")

    if problems:
        print(f"\n=== FILES WITH LICENSE ISSUES ({len(problems)}) ===")
        for rel_path, issues in sorted(problems.items()):
            print(f"\n {rel_path}:")
            for issue in issues[:5]:
                print(f" {issue}")

    if unparseable:
        print(f"\n=== UNPARSEABLE LICENSE BLOCKS ({len(unparseable)}) ===")
        for rel_path, issues in unparseable:
            print(f" {rel_path}: {issues[0]}")

    if errors:
        # Previously only counted in the summary, which hid which files
        # failed and why.
        print(f"\n=== ERRORS ({len(errors)}) ===")
        for rel_path, issues in errors:
            print(f" {rel_path}: {issues[0] if issues else 'unknown error'}")

    return min(len(problems), 255)


if __name__ == '__main__':
    sys.exit(main())