This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 4d6d7d5  ARROW-4312: [C++] Only run 2 * os.cpu_count() clang-format instances at once
4d6d7d5 is described below

commit 4d6d7d50835a8f634aae040c75f10f374790a781
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Tue Jan 22 19:21:36 2019 -0600

    ARROW-4312: [C++] Only run 2 * os.cpu_count() clang-format instances at once
    
    Author: Benjamin Kietzman <[email protected]>
    
    Closes #3447 from bkietz/ARROW-4312-run-clang-format-in-batches and squashes the following commits:
    
    2a51818f2 <Benjamin Kietzman> launch only cpu_count()*2 parallel processes
    0f530760a <Benjamin Kietzman> only run 16**2 clang-format instances at once
---
 cpp/build-support/lintutils.py        | 59 +++++++++++++-----------------
 cpp/build-support/run_clang_format.py | 14 +++++---
 cpp/build-support/run_clang_tidy.py   | 67 +++++++++++++++++------------------
 cpp/build-support/run_cpplint.py      |  3 +-
 4 files changed, 68 insertions(+), 75 deletions(-)

diff --git a/cpp/build-support/lintutils.py b/cpp/build-support/lintutils.py
index 1657cf2..0a54daa 100644
--- a/cpp/build-support/lintutils.py
+++ b/cpp/build-support/lintutils.py
@@ -17,9 +17,7 @@
 
 import os
 from fnmatch import fnmatch
-from pathlib import Path
-from subprocess import Popen, CompletedProcess
-from collections.abc import Mapping, Sequence
+from subprocess import Popen
 
 
 def chunk(seq, n):
@@ -27,15 +25,16 @@ def chunk(seq, n):
     divide a sequence into equal sized chunks
     (the last chunk may be smaller, but won't be empty)
     """
+    chunks = []
     some = []
     for element in seq:
         if len(some) == n:
-            yield some
+            chunks.append(some)
             some = []
-        else:
-            some.append(element)
+        some.append(element)
     if len(some) > 0:
-        yield some
+        chunks.append(some)
+    return chunks
 
 
 def dechunk(chunks):
@@ -48,29 +47,18 @@ def dechunk(chunks):
 
 def run_parallel(cmds, **kwargs):
     """
-    run each of cmds (with shared **kwargs) using subprocess.Popen
-    then wait for all of them to complete
-    returns a list of each command's CompletedProcess
+    Run each of cmds (with shared **kwargs) using subprocess.Popen
+    then wait for all of them to complete.
+    Runs batches of os.cpu_count() * 2 from cmds
+    returns a list of tuples containing each process'
+    returncode, stdout, stderr
     """
-    procs = []
-    for cmd in cmds:
-        if not isinstance(cmd, Sequence):
-            continue
-        procs.append(Popen(cmd, **kwargs))
-
-    for cmd in cmds:
-        if not isinstance(cmd, Mapping):
-            continue
-        cmd_kwargs = kwargs.copy()
-        cmd_kwargs.update(cmd)
-        procs.append(Popen(**cmd_kwargs))
-
     complete = []
-    for proc in procs:
-        result = proc.communicate()
-        c = CompletedProcess(proc.args, proc.returncode)
-        c.stdout, c.stderr = result
-        complete.append(c)
+    for cmds_batch in chunk(cmds, os.cpu_count() * 2):
+        procs_batch = [Popen(cmd, **kwargs) for cmd in cmds_batch]
+        for proc in procs_batch:
+            stdout, stderr = proc.communicate()
+            complete.append((proc.returncode, stdout, stderr))
     return complete
 
 
@@ -83,16 +71,18 @@ _source_extensions = '''
 def get_sources(source_dir, exclude_globs=[]):
     sources = []
     for directory, subdirs, basenames in os.walk(source_dir):
-        for path in [Path(directory) / basename for basename in basenames]:
+        for path in [os.path.join(directory, basename) for basename in basenames]:
             # filter out non-source files
-            if path.suffix not in _source_extensions:
+            if os.path.splitext(path)[1] not in _source_extensions:
                 continue
 
+            path = os.path.abspath(path)
+
             # filter out files that match the globs in the globs file
-            if any([fnmatch(str(path), glob) for glob in exclude_globs]):
+            if any([fnmatch(path, glob) for glob in exclude_globs]):
                 continue
 
-            sources.append(path.resolve())
+            sources.append(path)
     return sources
 
 
@@ -102,14 +92,15 @@ def stdout_pathcolonline(completed_process, filenames):
     by printing the path name followed by ':' then a line number, examine
     stdout and return the set of actually reported file names
     """
+    returncode, stdout, stderr = completed_process
     bfilenames = set()
     for filename in filenames:
         bfilenames.add(filename.encode('utf-8') + b':')
     problem_files = set()
-    for line in completed_process.stdout.splitlines():
+    for line in stdout.splitlines():
         for filename in bfilenames:
             if line.startswith(filename):
                 problem_files.add(filename.decode('utf-8'))
                 bfilenames.remove(filename)
                 break
-    return problem_files, completed_process.stdout
+    return problem_files, stdout
diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py
index 8d1bb23..1d1592d 100755
--- a/cpp/build-support/run_clang_format.py
+++ b/cpp/build-support/run_clang_format.py
@@ -16,6 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import print_function
 import lintutils
 from subprocess import PIPE
 import argparse
@@ -31,7 +32,8 @@ def _check_one_file(completed_processes, filename):
     with open(filename, "rb") as reader:
         original = reader.read()
 
-    formatted = completed_processes[filename].stdout
+    returncode, stdout, stderr = completed_processes[filename]
+    formatted = stdout
     if formatted != original:
         # Run the equivalent of diff -u
         diff = list(difflib.unified_diff(
@@ -92,9 +94,10 @@ if __name__ == "__main__":
             [arguments.clang_format_binary, "-i"] + some
             for some in lintutils.chunk(formatted_filenames, 16)
         ])
-        for result in results:
+        for returncode, stdout, stderr in results:
             # if any clang-format reported a parse error, bubble it
-            result.check_returncode()
+            if returncode != 0:
+                sys.exit(returncode)
 
     else:
         # run an instance of clang-format for each source file in parallel,
@@ -103,9 +106,10 @@ if __name__ == "__main__":
             [arguments.clang_format_binary, filename]
             for filename in formatted_filenames
         ], stdout=PIPE, stderr=PIPE)
-        for result in results:
+        for returncode, stdout, stderr in results:
             # if any clang-format reported a parse error, bubble it
-            result.check_returncode()
+            if returncode != 0:
+                sys.exit(returncode)
 
         error = False
         checker = partial(_check_one_file, {
diff --git a/cpp/build-support/run_clang_tidy.py b/cpp/build-support/run_clang_tidy.py
index 8e1be5a..57a3e91 100755
--- a/cpp/build-support/run_clang_tidy.py
+++ b/cpp/build-support/run_clang_tidy.py
@@ -16,10 +16,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import print_function
 import argparse
 import multiprocessing as mp
 import lintutils
-from subprocess import PIPE, DEVNULL
+from subprocess import PIPE
 import sys
 from functools import partial
 
@@ -36,40 +37,36 @@ def _check_some_files(completed_processes, filenames):
     return lintutils.stdout_pathcolonline(result, filenames)
 
 
-def _check_all(cmd, all_filenames):
-    # Output from clang-tidy can be massive, so only run 16 instances
-    # of it at a time. Process and drop that output before running more
-    for filenames in lintutils.chunk(all_filenames, 16 ** 2):
-        # each clang-tidy instance will process 16 files
-        chunks = list(lintutils.chunk(filenames, 16))
-        cmds = [cmd + some for some in chunks]
-        results = lintutils.run_parallel(cmds, stderr=DEVNULL, stdout=PIPE)
+def _check_all(cmd, filenames):
+    # each clang-tidy instance will process 16 files
+    chunks = lintutils.chunk(filenames, 16)
+    cmds = [cmd + some for some in chunks]
+    results = lintutils.run_parallel(cmds, stderr=PIPE, stdout=PIPE)
+    error = False
+    # record completed processes (keyed by the first filename in the input
+    # chunk) for lookup in _check_some_files
+    completed_processes = {
+        _get_chunk_key(some): result
+        for some, result in zip(chunks, results)
+    }
+    checker = partial(_check_some_files, completed_processes)
+    pool = mp.Pool()
+    try:
+        # check output of completed clang-tidy invocations in parallel
+        for problem_files, stdout in pool.imap(checker, chunks):
+            if problem_files:
+                msg = "clang-tidy suggested fixes for {}"
+                print("\n".join(map(msg.format, problem_files)))
+                error = True
+    except Exception:
+        error = True
+        raise
+    finally:
+        pool.terminate()
+        pool.join()
 
-        error = False
-        # record completed processes (keyed by the first filename in the input chunk)
-        # for lookup in _check_some_files
-        completed_processes = {
-            _get_chunk_key(some): result
-            for some, result in zip(chunks, results)
-        }
-        checker = partial(_check_some_files, completed_processes)
-        pool = mp.Pool()
-        try:
-            # check output of completed clang-tidy invocations in parallel
-            for problem_files, stdout in pool.imap(checker, chunks):
-                if problem_files:
-                    msg = "clang-tidy suggested fixes for {}"
-                    print("\n".join(map(msg.format, problem_files)))
-                    error = True
-        except Exception:
-            error = True
-            raise
-        finally:
-            pool.terminate()
-            pool.join()
-
-        if error:
-            sys.exit(1)
+    if error:
+        sys.exit(1)
 
 
 if __name__ == "__main__":
@@ -99,7 +96,7 @@ if __name__ == "__main__":
 
     linted_filenames = []
     for path in lintutils.get_sources(arguments.source_dir):
-        linted_filenames.append(str(path))
+        linted_filenames.append(path)
 
     if not arguments.quiet:
         msg = 'Tidying {}' if arguments.fix else 'Checking {}'
diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py
index 83fdc96..035a02e 100755
--- a/cpp/build-support/run_cpplint.py
+++ b/cpp/build-support/run_cpplint.py
@@ -16,6 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import print_function
 import lintutils
 from subprocess import PIPE, STDOUT
 import argparse
@@ -91,7 +92,7 @@ if __name__ == "__main__":
                             linted_filenames)))
 
     # lint files in chunks: each invocation of cpplint will process 16 files
-    chunks = list(lintutils.chunk(linted_filenames, 16))
+    chunks = lintutils.chunk(linted_filenames, 16)
     cmds = [cmd + some for some in chunks]
     results = lintutils.run_parallel(cmds, stdout=PIPE, stderr=STDOUT)
 

Reply via email to