The linter in contrib/ checks all messages that the GCC user may see whether they conform roughly to the GCC Guidelines for Diagnostics.
* contrib/check-internal-format-escaping.py: improve output of the linter by providing short rationales for the warnings; detect space before punctuation; temporarily disable the most frequent diagnostics, in order to first focus on the ones that can be fixed more quickly
Index: contrib/check-internal-format-escaping.py IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- contrib/check-internal-format-escaping.py (revision c7bf741f801cfbdf56bf1ee11a74ca0abe1e8c16) +++ contrib/check-internal-format-escaping.py (date 1557864788799) @@ -41,18 +41,19 @@ diagnostic_id: str, diagnostic: str, include_msgid=True): """ To suppress a warning for a particular message, - add a line "#, gcclint:ignore:{diagnostic_id}" to the message. + add a comment "# gcclint:ignore:{diagnostic_id}" to the message. """ - if f'gcclint:ignore:{diagnostic_id}' in msg.flags: + marker = f'gcclint:ignore:{diagnostic_id}' + if marker in msg.comment: return - seen_warnings[diagnostic] += 1 + seen_warnings[diagnostic_id] += 1 if include_msgid: - print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}') + print(f'{location(msg)}: {diagnostic} in {msg.msgid!r} [{marker}]') else: - print(f'{location(msg)}: {diagnostic}') + print(f'{location(msg)}: {diagnostic} [{marker}]') def lint_gcc_internal_format(msg: polib.POEntry): @@ -68,6 +69,9 @@ before = msgid[:m.start(0)] return before.count("%<") == before.count("%>") + def find_outside_quotes(regex: str, s: str): + return filter(outside_quotes, re.finditer(regex, s)) + def lint_matching_placeholders(): """ Warns when literal values in placeholders are not exactly equal @@ -89,36 +93,31 @@ in_msgstr = re.findall('%<[^%]+%>', msg.msgstr) if set(in_msgid) != set(in_msgstr): - warn(msg, - 'placeholder-mismatch', + warn(msg, 'placeholder-mismatch', f'placeholder mismatch: msgid has {in_msgid}, ' f'msgstr has {in_msgstr}', include_msgid=False) def lint_option_outside_quotes(): - for match in re.finditer(r'\S+', msgid): + for match in find_outside_quotes(r'\S+', msgid): part = match.group() - if not outside_quotes(match): - continue - if part.startswith('-'): - if len(part) >= 2 and part[1].isalpha(): - if part == '-INF': - continue - - warn(msg, - 'option-outside-quotes', - 'command line option outside %<quotes%>') + if len(part) >= 2 and part[0] == '-' \ + and part[1].isalpha() and part != '-INF': + warn(msg, 'option-outside-quotes', + 'command line options ' + 'should be enclosed in %<quotes%>') if part.startswith('__builtin_'): - warn(msg, - 'builtin-outside-quotes', - 'builtin function outside %<quotes%>') + warn(msg, 'builtin-outside-quotes', + 'names of builtin functions ' + 'should be enclosed in %<quotes%>') def lint_plain_apostrophe(): - for match in re.finditer("[^%]'", msgid): - if outside_quotes(match): - warn(msg, 'apostrophe', 'apostrophe without leading %') + if any(find_outside_quotes("[^%]'", msgid)): + warn(msg, 'apostrophe', + 'apostrophes should be written as %\' ' + 'to make them typographically correct') def lint_space_before_quote(): """ @@ -129,26 +128,45 @@ for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid): if match.group(1) != '%s': - warn(msg, - 'no-space-before-quote', - '%< directly following a letter or digit') + warn(msg, 'space-before-quote', + 'a letter or digit should not be ' + 'directly followed by a %<') + + def lint_space_before_punctuation(): + """ + In written English, there is no space before a colon. + In text enclosed in %<quotes%> it's ok though. + """ + + for match in find_outside_quotes(" [.,:;?]", msgid): + warn(msg, 'space-before-punctuation', + f'there should be no space before punctuation ' + f'{match.group()!r}') def lint_underscore_outside_quotes(): """ An underscore outside of quotes is used in several contexts, - and many of them violate the GCC Guidelines for Diagnostics: + and several of them violate the GCC Guidelines for Diagnostics + by not speaking in terms of the language being compiled: * names of GCC-internal compiler functions * names of GCC-internal data structures - * static_cast and the like (which are legitimate) + + In some cases the underscore is used legitimately. + It could be quoted but often doesn't need to since it is + visually distinct enough from plain English text. Often + there are similar messages though, using keywords that are + valid English words, too. This can be confusing for both + GCC users and translators. + + * static_cast and related C++ casts + * x86_64 """ - for match in re.finditer("_", msgid): - if outside_quotes(match): - warn(msg, - 'underscore-outside-quotes', - 'underscore outside of %<quotes%>') - return + for _ in find_outside_quotes("_", msgid): + warn(msg, 'underscore-outside-quotes', + 'underscore outside of %<quotes%>') + return def lint_may_not(): """ @@ -158,31 +176,23 @@ """ if re.search(r'\bmay not\b', msgid): - warn(msg, - 'ambiguous-may-not', + warn(msg, 'ambiguous-may-not', 'the term "may not" is ambiguous') def lint_unbalanced_quotes(): if msgid.count("%<") != msgid.count("%>"): - warn(msg, - 'unbalanced-quotes', + warn(msg, 'unbalanced-quotes', 'unbalanced %< and %> quotes') if msg.translated(): if msg.msgstr.count("%<") != msg.msgstr.count("%>"): - warn(msg, - 'unbalanced-quotes', + warn(msg, 'unbalanced-quotes', 'unbalanced %< and %> quotes') def lint_single_space_after_sentence(): - """ - After a sentence there should be two spaces. - """ - if re.search(r'[.] [A-Z]', msgid): - warn(msg, - 'single-space-after-sentence', - 'single space after sentence') + warn(msg, 'single-space-after-sentence', + 'sentences should be ended using two spaces') def lint_non_canonical_quotes(): """ @@ -190,14 +200,14 @@ """ match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid) if match: - warn(msg, - 'non-canonical-quotes', + warn(msg, 'non-canonical-quotes', f'placeholder {match.group()} should be written as %qs') lint_option_outside_quotes() lint_plain_apostrophe() lint_space_before_quote() - lint_underscore_outside_quotes() + lint_space_before_punctuation() + False and lint_underscore_outside_quotes() lint_may_not() lint_unbalanced_quotes() lint_matching_placeholders() @@ -205,7 +215,7 @@ lint_non_canonical_quotes() -def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile): +def lint_same_pattern(po: polib.POFile): """ Detects messages that are structurally the same, except that they use different plain strings inside %<quotes%>. These messages can @@ -230,8 +240,8 @@ prev = seen[normalized] warn(msg, 'same-pattern', - f'same pattern for {repr(msgid)} and ' - f'{repr(prev.msgid)} in {location(prev)}', + f'the message {msgid!r} is structurally equal to ' + f'{prev.msgid!r} in {location(prev)}', include_msgid=False) @@ -243,23 +253,27 @@ if 'gcc-internal-format' in msg.flags: lint_gcc_internal_format(msg) - lint_diagnostics_differing_only_in_placeholders(po) + False and lint_same_pattern(po) + + +def print_summary(): + print() + print('summary:') + for entry in seen_warnings.most_common(): + if entry[1] > 1: + print(f'{entry[1]:>5} {entry[0]}') def main(): parser = argparse.ArgumentParser(description='') - parser.add_argument('file', help='pot file') + parser.add_argument('file', help='pot or po file') args = parser.parse_args() po = polib.pofile(args.file) lint_file(po) - print() - print('summary:') - for entry in seen_warnings.most_common(): - if entry[1] > 1: - print(f'{entry[1]}\t{entry[0]}') + print_summary() if __name__ == '__main__':