This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 4d6d7d5 ARROW-4312: [C++] Only run 2 * os.cpu_count() clang-format instances at once
4d6d7d5 is described below
commit 4d6d7d50835a8f634aae040c75f10f374790a781
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Tue Jan 22 19:21:36 2019 -0600
ARROW-4312: [C++] Only run 2 * os.cpu_count() clang-format instances at once
Author: Benjamin Kietzman <[email protected]>
Closes #3447 from bkietz/ARROW-4312-run-clang-format-in-batches and squashes the following commits:
2a51818f2 <Benjamin Kietzman> launch only cpu_count()*2 parallel processes
0f530760a <Benjamin Kietzman> only run 16**2 clang-format instances at once
---
cpp/build-support/lintutils.py | 59 +++++++++++++-----------------
cpp/build-support/run_clang_format.py | 14 +++++---
cpp/build-support/run_clang_tidy.py | 67 +++++++++++++++++------------------
cpp/build-support/run_cpplint.py | 3 +-
4 files changed, 68 insertions(+), 75 deletions(-)
diff --git a/cpp/build-support/lintutils.py b/cpp/build-support/lintutils.py
index 1657cf2..0a54daa 100644
--- a/cpp/build-support/lintutils.py
+++ b/cpp/build-support/lintutils.py
@@ -17,9 +17,7 @@
import os
from fnmatch import fnmatch
-from pathlib import Path
-from subprocess import Popen, CompletedProcess
-from collections.abc import Mapping, Sequence
+from subprocess import Popen
def chunk(seq, n):
@@ -27,15 +25,16 @@ def chunk(seq, n):
divide a sequence into equal sized chunks
(the last chunk may be smaller, but won't be empty)
"""
+ chunks = []
some = []
for element in seq:
if len(some) == n:
- yield some
+ chunks.append(some)
some = []
- else:
- some.append(element)
+ some.append(element)
if len(some) > 0:
- yield some
+ chunks.append(some)
+ return chunks
def dechunk(chunks):
@@ -48,29 +47,18 @@ def dechunk(chunks):
def run_parallel(cmds, **kwargs):
"""
- run each of cmds (with shared **kwargs) using subprocess.Popen
- then wait for all of them to complete
- returns a list of each command's CompletedProcess
+ Run each of cmds (with shared **kwargs) using subprocess.Popen
+ then wait for all of them to complete.
+ Runs batches of os.cpu_count() * 2 from cmds
+ returns a list of tuples containing each process'
+ returncode, stdout, stderr
"""
- procs = []
- for cmd in cmds:
- if not isinstance(cmd, Sequence):
- continue
- procs.append(Popen(cmd, **kwargs))
-
- for cmd in cmds:
- if not isinstance(cmd, Mapping):
- continue
- cmd_kwargs = kwargs.copy()
- cmd_kwargs.update(cmd)
- procs.append(Popen(**cmd_kwargs))
-
complete = []
- for proc in procs:
- result = proc.communicate()
- c = CompletedProcess(proc.args, proc.returncode)
- c.stdout, c.stderr = result
- complete.append(c)
+ for cmds_batch in chunk(cmds, os.cpu_count() * 2):
+ procs_batch = [Popen(cmd, **kwargs) for cmd in cmds_batch]
+ for proc in procs_batch:
+ stdout, stderr = proc.communicate()
+ complete.append((proc.returncode, stdout, stderr))
return complete
@@ -83,16 +71,18 @@ _source_extensions = '''
def get_sources(source_dir, exclude_globs=[]):
sources = []
for directory, subdirs, basenames in os.walk(source_dir):
- for path in [Path(directory) / basename for basename in basenames]:
+ for path in [os.path.join(directory, basename) for basename in basenames]:
# filter out non-source files
- if path.suffix not in _source_extensions:
+ if os.path.splitext(path)[1] not in _source_extensions:
continue
+ path = os.path.abspath(path)
+
# filter out files that match the globs in the globs file
- if any([fnmatch(str(path), glob) for glob in exclude_globs]):
+ if any([fnmatch(path, glob) for glob in exclude_globs]):
continue
- sources.append(path.resolve())
+ sources.append(path)
return sources
@@ -102,14 +92,15 @@ def stdout_pathcolonline(completed_process, filenames):
by printing the path name followed by ':' then a line number, examine
stdout and return the set of actually reported file names
"""
+ returncode, stdout, stderr = completed_process
bfilenames = set()
for filename in filenames:
bfilenames.add(filename.encode('utf-8') + b':')
problem_files = set()
- for line in completed_process.stdout.splitlines():
+ for line in stdout.splitlines():
for filename in bfilenames:
if line.startswith(filename):
problem_files.add(filename.decode('utf-8'))
bfilenames.remove(filename)
break
- return problem_files, completed_process.stdout
+ return problem_files, stdout
diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py
index 8d1bb23..1d1592d 100755
--- a/cpp/build-support/run_clang_format.py
+++ b/cpp/build-support/run_clang_format.py
@@ -16,6 +16,7 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import print_function
import lintutils
from subprocess import PIPE
import argparse
@@ -31,7 +32,8 @@ def _check_one_file(completed_processes, filename):
with open(filename, "rb") as reader:
original = reader.read()
- formatted = completed_processes[filename].stdout
+ returncode, stdout, stderr = completed_processes[filename]
+ formatted = stdout
if formatted != original:
# Run the equivalent of diff -u
diff = list(difflib.unified_diff(
@@ -92,9 +94,10 @@ if __name__ == "__main__":
[arguments.clang_format_binary, "-i"] + some
for some in lintutils.chunk(formatted_filenames, 16)
])
- for result in results:
+ for returncode, stdout, stderr in results:
# if any clang-format reported a parse error, bubble it
- result.check_returncode()
+ if returncode != 0:
+ sys.exit(returncode)
else:
# run an instance of clang-format for each source file in parallel,
@@ -103,9 +106,10 @@ if __name__ == "__main__":
[arguments.clang_format_binary, filename]
for filename in formatted_filenames
], stdout=PIPE, stderr=PIPE)
- for result in results:
+ for returncode, stdout, stderr in results:
# if any clang-format reported a parse error, bubble it
- result.check_returncode()
+ if returncode != 0:
+ sys.exit(returncode)
error = False
checker = partial(_check_one_file, {
diff --git a/cpp/build-support/run_clang_tidy.py b/cpp/build-support/run_clang_tidy.py
index 8e1be5a..57a3e91 100755
--- a/cpp/build-support/run_clang_tidy.py
+++ b/cpp/build-support/run_clang_tidy.py
@@ -16,10 +16,11 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import print_function
import argparse
import multiprocessing as mp
import lintutils
-from subprocess import PIPE, DEVNULL
+from subprocess import PIPE
import sys
from functools import partial
@@ -36,40 +37,36 @@ def _check_some_files(completed_processes, filenames):
return lintutils.stdout_pathcolonline(result, filenames)
-def _check_all(cmd, all_filenames):
- # Output from clang-tidy can be massive, so only run 16 instances
- # of it at a time. Process and drop that output before running more
- for filenames in lintutils.chunk(all_filenames, 16 ** 2):
- # each clang-tidy instance will process 16 files
- chunks = list(lintutils.chunk(filenames, 16))
- cmds = [cmd + some for some in chunks]
- results = lintutils.run_parallel(cmds, stderr=DEVNULL, stdout=PIPE)
+def _check_all(cmd, filenames):
+ # each clang-tidy instance will process 16 files
+ chunks = lintutils.chunk(filenames, 16)
+ cmds = [cmd + some for some in chunks]
+ results = lintutils.run_parallel(cmds, stderr=PIPE, stdout=PIPE)
+ error = False
+ # record completed processes (keyed by the first filename in the input
+ # chunk) for lookup in _check_some_files
+ completed_processes = {
+ _get_chunk_key(some): result
+ for some, result in zip(chunks, results)
+ }
+ checker = partial(_check_some_files, completed_processes)
+ pool = mp.Pool()
+ try:
+ # check output of completed clang-tidy invocations in parallel
+ for problem_files, stdout in pool.imap(checker, chunks):
+ if problem_files:
+ msg = "clang-tidy suggested fixes for {}"
+ print("\n".join(map(msg.format, problem_files)))
+ error = True
+ except Exception:
+ error = True
+ raise
+ finally:
+ pool.terminate()
+ pool.join()
- error = False
- # record completed processes (keyed by the first filename in the input chunk)
- # for lookup in _check_some_files
- completed_processes = {
- _get_chunk_key(some): result
- for some, result in zip(chunks, results)
- }
- checker = partial(_check_some_files, completed_processes)
- pool = mp.Pool()
- try:
- # check output of completed clang-tidy invocations in parallel
- for problem_files, stdout in pool.imap(checker, chunks):
- if problem_files:
- msg = "clang-tidy suggested fixes for {}"
- print("\n".join(map(msg.format, problem_files)))
- error = True
- except Exception:
- error = True
- raise
- finally:
- pool.terminate()
- pool.join()
-
- if error:
- sys.exit(1)
+ if error:
+ sys.exit(1)
if __name__ == "__main__":
@@ -99,7 +96,7 @@ if __name__ == "__main__":
linted_filenames = []
for path in lintutils.get_sources(arguments.source_dir):
- linted_filenames.append(str(path))
+ linted_filenames.append(path)
if not arguments.quiet:
msg = 'Tidying {}' if arguments.fix else 'Checking {}'
diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py
index 83fdc96..035a02e 100755
--- a/cpp/build-support/run_cpplint.py
+++ b/cpp/build-support/run_cpplint.py
@@ -16,6 +16,7 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import print_function
import lintutils
from subprocess import PIPE, STDOUT
import argparse
@@ -91,7 +92,7 @@ if __name__ == "__main__":
linted_filenames)))
# lint files in chunks: each invocation of cpplint will process 16 files
- chunks = list(lintutils.chunk(linted_filenames, 16))
+ chunks = lintutils.chunk(linted_filenames, 16)
cmds = [cmd + some for some in chunks]
results = lintutils.run_parallel(cmds, stdout=PIPE, stderr=STDOUT)