Copilot commented on code in PR #50056:
URL: https://github.com/apache/arrow/pull/50056#discussion_r3460072137


##########
dev/archery/archery/benchmark/runner.py:
##########
@@ -211,48 +247,59 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, 
**kwargs):
         """
         build = None
         if StaticBenchmarkRunner.is_json_result(rev_or_path):
-            kwargs.pop('benchmark_extras', None)
+            kwargs.pop("benchmark_extras", None)
             return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs)
         elif CMakeBuild.is_build_dir(rev_or_path):
             build = CMakeBuild.from_path(rev_or_path)
             return CppBenchmarkRunner(build, **kwargs)
         else:
-            # Revisions can references remote via the `/` character, ensure
-            # that the revision is path friendly
-            path_rev = rev_or_path.replace("/", "_")
+            path_rev = _rev_or_path_dirname(src, rev_or_path)
             root_rev = os.path.join(root, path_rev)
-            os.mkdir(root_rev)
+            os.makedirs(root_rev, exist_ok=True)
 
+            # Clone dir is reused when there is known revision (git sha)
+            # in <preserve-dir>/sha/arrow
             clone_dir = os.path.join(root_rev, "arrow")
-            # Possibly checkout the sources at given revision, no need to
-            # perform cleanup on cloned repository as root_rev is reclaimed.
-            src_rev, _ = src.at_revision(rev_or_path, clone_dir)
+            if os.path.isdir(clone_dir):
+                src_rev = ArrowSources(clone_dir)
+            else:
+                src_rev, _ = src.at_revision(rev_or_path, clone_dir)
             cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
-            build_dir = os.path.join(root_rev, "build")
-            return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs)
+            run_root = os.path.join(root_rev, "build")
+            os.makedirs(run_root, exist_ok=True)
+            # run_id is a path-safe ISO-8601 UTC timestamp down to the second
+            # so users get a brief idea of what run they are looking at.
+            now = datetime.datetime.now(datetime.timezone.utc)
+            run_id = now.strftime("%Y-%m-%dT%H-%M-%SZ")
+            build_dir = os.path.join(run_root, run_id)

Review Comment:
   `run_id` only has second-level precision. Two benchmark invocations started 
within the same second (or a rerun after a partial failure) will collide on the 
same `build_dir` / `results_dir`, potentially reusing or clobbering artifacts 
unexpectedly. Use microseconds (or add a random suffix) to make the directory 
name reliably unique.



##########
dev/archery/archery/benchmark/runner.py:
##########
@@ -211,48 +247,59 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, 
**kwargs):
         """
         build = None
         if StaticBenchmarkRunner.is_json_result(rev_or_path):
-            kwargs.pop('benchmark_extras', None)
+            kwargs.pop("benchmark_extras", None)
             return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs)
         elif CMakeBuild.is_build_dir(rev_or_path):
             build = CMakeBuild.from_path(rev_or_path)
             return CppBenchmarkRunner(build, **kwargs)
         else:
-            # Revisions can references remote via the `/` character, ensure
-            # that the revision is path friendly
-            path_rev = rev_or_path.replace("/", "_")
+            path_rev = _rev_or_path_dirname(src, rev_or_path)
             root_rev = os.path.join(root, path_rev)
-            os.mkdir(root_rev)
+            os.makedirs(root_rev, exist_ok=True)
 
+            # Clone dir is reused when there is known revision (git sha)
+            # in <preserve-dir>/sha/arrow
             clone_dir = os.path.join(root_rev, "arrow")
-            # Possibly checkout the sources at given revision, no need to
-            # perform cleanup on cloned repository as root_rev is reclaimed.
-            src_rev, _ = src.at_revision(rev_or_path, clone_dir)
+            if os.path.isdir(clone_dir):
+                src_rev = ArrowSources(clone_dir)
+            else:
+                src_rev, _ = src.at_revision(rev_or_path, clone_dir)
             cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
-            build_dir = os.path.join(root_rev, "build")
-            return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs)
+            run_root = os.path.join(root_rev, "build")
+            os.makedirs(run_root, exist_ok=True)

Review Comment:
   The PR description shows build artifacts under `build/run/<id>`, but the 
code currently writes to `build/<id>`. If the intended layout includes the 
extra `run/` level, set `run_root` accordingly so the on-disk structure matches 
the documented layout.



##########
dev/archery/archery/cli.py:
##########
@@ -465,6 +494,15 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, 
cmake_extras,
         # when asked to JSON-serialize the results, so produce a JSON
         # output even when none is requested.
         json_out = json.dumps(runner_base, cls=JsonEncoder)
+        if runner_base.results_dir is not None:
+            results_path = os.path.join(runner_base.results_dir,
+                                        "benchmark.json")
+            with open(results_path, "w") as f:

Review Comment:
   New behavior writes `benchmark.json` and `metadata.json` into 
`runner_base.results_dir`, but there doesn't appear to be a CLI test exercising 
`archery benchmark run --preserve --preserve-dir ...` and asserting these files 
are created. Adding a `CliRunner` test with a mocked 
`CppBenchmarkRunner.from_rev_or_path()` (setting `results_dir` to a temp 
directory) would help prevent regressions.



##########
dev/archery/archery/cli.py:
##########
@@ -41,6 +43,27 @@
 BOOL = ArrowBool()
 
 
+def _run_metadata(ctx):
+    uname = platform.uname()
+    return {
+        "time": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+        "cmd": {
+            "full": " ".join(sys.argv),
+            "params": ctx.params,
+        },

Review Comment:
   `"full": " ".join(sys.argv)` loses quoting/escaping information (arguments 
containing spaces, quotes, etc.), making the recorded command hard to reliably 
re-run. Consider also storing the raw argv list so the invocation can be 
reconstructed unambiguously.



##########
dev/archery/archery/benchmark/runner.py:
##########
@@ -211,48 +247,59 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, 
**kwargs):
         """
         build = None
         if StaticBenchmarkRunner.is_json_result(rev_or_path):
-            kwargs.pop('benchmark_extras', None)
+            kwargs.pop("benchmark_extras", None)
             return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs)
         elif CMakeBuild.is_build_dir(rev_or_path):
             build = CMakeBuild.from_path(rev_or_path)
             return CppBenchmarkRunner(build, **kwargs)
         else:
-            # Revisions can references remote via the `/` character, ensure
-            # that the revision is path friendly
-            path_rev = rev_or_path.replace("/", "_")
+            path_rev = _rev_or_path_dirname(src, rev_or_path)
             root_rev = os.path.join(root, path_rev)
-            os.mkdir(root_rev)
+            os.makedirs(root_rev, exist_ok=True)
 
+            # Clone dir is reused when there is known revision (git sha)
+            # in <preserve-dir>/sha/arrow
             clone_dir = os.path.join(root_rev, "arrow")
-            # Possibly checkout the sources at given revision, no need to
-            # perform cleanup on cloned repository as root_rev is reclaimed.
-            src_rev, _ = src.at_revision(rev_or_path, clone_dir)
+            if os.path.isdir(clone_dir):
+                src_rev = ArrowSources(clone_dir)
+            else:
+                src_rev, _ = src.at_revision(rev_or_path, clone_dir)
             cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
-            build_dir = os.path.join(root_rev, "build")
-            return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs)
+            run_root = os.path.join(root_rev, "build")
+            os.makedirs(run_root, exist_ok=True)
+            # run_id is a path-safe ISO-8601 UTC timestamp down to the second
+            # so users get a brief idea of what run they are looking at.
+            now = datetime.datetime.now(datetime.timezone.utc)
+            run_id = now.strftime("%Y-%m-%dT%H-%M-%SZ")
+            build_dir = os.path.join(run_root, run_id)
+            build = cmake_def.build(build_dir)
+            results_dir = os.path.join(root_rev, "bench", run_id)
+            os.makedirs(results_dir, exist_ok=True)

Review Comment:
   The PR description shows benchmark outputs under `bench/run/<id>`, but the 
code currently writes to `bench/<id>`. If `run/` is part of the intended 
preserved layout, include it in `results_dir` for consistency with the 
documented structure.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to