This is an automated email from the ASF dual-hosted git repository.

assignuser pushed a commit to branch release-20.0.0-rc1
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 1e2aa39012961e85a719857f0089fa04ab9119b3
Author: Jacob Wujciak-Jens <[email protected]>
AuthorDate: Tue Apr 15 17:27:09 2025 +0200

    GH-46075: [Release][CI] Fix binary verification (#46076)
    
    ### Rationale for this change
    
    ### What changes are included in this PR?
    
    ### Are these changes tested?
    
    ### Are there any user-facing changes?
    
    **This PR includes breaking changes to public APIs.** (If there are any
    breaking changes to public APIs, please explain which changes are breaking. If
    not, you can remove this.)
    
    **This PR contains a "Critical Fix".** (If the changes fix either (a) a
    security vulnerability, (b) a bug that caused incorrect or invalid data to be
    produced, or (c) a bug that causes a crash (even when the API contract is
    upheld), please provide explanation. If not, you can remove this.)
    
    * GitHub Issue: #46075
    
    Authored-by: Jacob Wujciak-Jens <[email protected]>
    Signed-off-by: Jacob Wujciak-Jens <[email protected]>
---
 .github/workflows/verify_rc.yml                 |  44 ++++--
 dev/release/download_rc_binaries.py             | 178 ++++++++++++++----------
 dev/release/verify-release-candidate-wheels.bat |  31 ++---
 dev/release/verify-release-candidate.sh         |  10 +-
 4 files changed, 155 insertions(+), 108 deletions(-)

diff --git a/.github/workflows/verify_rc.yml b/.github/workflows/verify_rc.yml
index fe46ae6f23..dceb04a492 100644
--- a/.github/workflows/verify_rc.yml
+++ b/.github/workflows/verify_rc.yml
@@ -21,6 +21,16 @@ on:
   push:
     tags:
       - "*-rc*"
+  pull_request:
+    paths:
+      - ".github/workflows/verify_rc.yml"
+  workflow_dispatch:
+    inputs:
+      rc_tag:
+        description: "Tag of the rc to verify"
+        type: string
+        required: true
+
 
 permissions:
   contents: read
@@ -28,6 +38,7 @@ permissions:
 env:
   TEST_DEFAULT: "0"
   VERBOSE: "1"
+  RC_TAG: "${{ inputs.rc_tag || github.event_name == 'pull_request' && 'apache-arrow-20.0.0-rc0' || github.ref_name }}"
 
 jobs:
   apt:
@@ -46,9 +57,9 @@ jobs:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2
       - name: Run
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           version=${package_id#apache-arrow-}
-          rc=${GITHUB_REF_NAME#*-rc}
+          rc=${RC_TAG#*-rc}
           dev/release/verify-release-candidate.sh ${version} ${rc}
 
   binary:
@@ -61,9 +72,9 @@ jobs:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2
       - name: Run
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           version=${package_id#apache-arrow-}
-          rc=${GITHUB_REF_NAME#*-rc}
+          rc=${RC_TAG#*-rc}
           dev/release/verify-release-candidate.sh ${version} ${rc}
 
   wheels-linux:
@@ -89,9 +100,9 @@ jobs:
         run: python -m pip install -e dev/archery[docker]
       - name: Prepare
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV}
-          echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV}
+          echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV}
           distro=${{ matrix.distro }}
           if [ "${distro}" = "conda" ]; then
             echo "SERVICE=${distro}-verify-rc" >> ${GITHUB_ENV}
@@ -102,6 +113,8 @@ jobs:
             echo "$(echo ${os} | tr a-z A-Z)=${version}" >> ${GITHUB_ENV}
           fi
       - name: Run
+        env:
+          GH_TOKEN: ${{ github.token }}
         run: |
           archery docker run \
             -e TEST_DEFAULT="${TEST_DEFAULT}" \
@@ -109,6 +122,7 @@ jobs:
             -e VERBOSE="${VERBOSE}" \
             -e VERIFY_RC="${RC}" \
             -e VERIFY_VERSION="${VERSION}" \
+            -e GH_TOKEN="$GH_TOKEN" \
             ${SERVICE}
 
   wheels-macos:
@@ -126,10 +140,12 @@ jobs:
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2
       - name: Run
+        env:
+          GH_TOKEN: ${{ github.token }}
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           version=${package_id#apache-arrow-}
-          rc=${GITHUB_REF_NAME#*-rc}
+          rc=${RC_TAG#*-rc}
           dev/release/verify-release-candidate.sh ${version} ${rc}
 
   wheels-windows:
@@ -141,12 +157,14 @@ jobs:
       TEST_WHEELS: "1"
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2
+        with:
+          submodules: recursive
       - name: Prepare
         shell: bash
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV}
-          echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV}
+          echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV}
       - uses: 
conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # 
v3.1.1
       - name: Install System Dependencies
         run: |
@@ -156,6 +174,8 @@ jobs:
         shell: bash
         run: ci/scripts/download_tz_database.sh
       - name: Run verification
+        env:
+          GH_TOKEN: ${{ github.token }}
         shell: cmd
         run: |
           dev/release/verify-release-candidate-wheels.bat %VERSION% %RC%
@@ -176,7 +196,7 @@ jobs:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2
       - name: Run
         run: |
-          package_id=${GITHUB_REF_NAME%-rc*}
+          package_id=${RC_TAG%-rc*}
           version=${package_id#apache-arrow-}
-          rc=${GITHUB_REF_NAME#*-rc}
+          rc=${RC_TAG#*-rc}
           dev/release/verify-release-candidate.sh ${version} ${rc}
diff --git a/dev/release/download_rc_binaries.py 
b/dev/release/download_rc_binaries.py
index 3bc0012116..01f6588c6d 100755
--- a/dev/release/download_rc_binaries.py
+++ b/dev/release/download_rc_binaries.py
@@ -28,7 +28,6 @@ import subprocess
 import time
 import urllib.request
 
-
 DEFAULT_PARALLEL_DOWNLOADS = 8
 
 
@@ -36,33 +35,31 @@ class Downloader:
 
     def get_file_list(self, prefix, filter=None):
         def traverse(directory, files, directories):
-            url = f'{self.URL_ROOT}/{directory}'
+            url = f"{self.URL_ROOT}/{directory}"
             response = urllib.request.urlopen(url).read().decode()
             paths = re.findall('<a href="(.+?)"', response)
             for path in paths:
-                path = re.sub(f'^{re.escape(url)}',
-                              '',
-                              path)
-                if path == '../':
+                path = re.sub(f"^{re.escape(url)}", "", path)
+                if path == "../":
                     continue
-                resolved_path = f'{directory}{path}'
+                resolved_path = f"{directory}{path}"
                 if filter and not filter(path):
                     continue
-                if path.endswith('/'):
+                if path.endswith("/"):
                     directories.append(resolved_path)
                 else:
                     files.append(resolved_path)
+
         files = []
-        if prefix != '' and not prefix.endswith('/'):
-            prefix += '/'
+        if prefix != "" and not prefix.endswith("/"):
+            prefix += "/"
         directories = [prefix]
         while len(directories) > 0:
             directory = directories.pop()
             traverse(directory, files, directories)
         return files
 
-    def download_files(self, files, dest=None, num_parallel=None,
-                       re_match=None):
+    def download_files(self, files, dest=None, num_parallel=None, 
re_match=None):
         """
         Download files from Bintray in parallel. If file already exists, will
         overwrite if the checksum does not match what Bintray says it should be
@@ -83,19 +80,21 @@ class Downloader:
             num_parallel = DEFAULT_PARALLEL_DOWNLOADS
 
         if re_match is not None:
-            regex = re.compile(re_match)
-            files = [x for x in files if regex.match(x)]
+            files = self._filter_files(files, re_match)
 
         if num_parallel == 1:
             for path in files:
                 self._download_file(dest, path)
         else:
             parallel_map_terminate_early(
-                functools.partial(self._download_file, dest),
-                files,
-                num_parallel
+                functools.partial(self._download_file,
+                                  dest), files, num_parallel
             )
 
+    def _filter_files(self, files, re_match):
+        regex = re.compile(re_match)
+        return [x for x in files if regex.match(x)]
+
     def _download_file(self, dest, path):
         base, filename = os.path.split(path)
 
@@ -106,7 +105,7 @@ class Downloader:
 
         print("Downloading {} to {}".format(path, dest_path))
 
-        url = f'{self.URL_ROOT}/{path}'
+        url = f"{self.URL_ROOT}/{path}"
         self._download_url(url, dest_path)
 
     def _download_url(self, url, dest_path, *, extra_args=None):
@@ -128,8 +127,8 @@ class Downloader:
                 delay = attempt * 3
                 print(f"Waiting {delay} seconds before retrying {url}")
                 time.sleep(delay)
-            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
-                                    stderr=subprocess.PIPE)
+            proc = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             stdout, stderr = proc.communicate()
             if proc.returncode != 0:
                 try:
@@ -142,8 +141,9 @@ class Downloader:
                     break
             else:
                 return
-        raise Exception(f"Downloading {url} failed\n"
-                        f"stdout: {stdout}\nstderr: {stderr}")
+        raise Exception(
+            f"Downloading {url} failed\n" f"stdout: {stdout}\nstderr: {stderr}"
+        )
 
     def _curl_version(self):
         cmd = ["curl", "--version"]
@@ -157,8 +157,10 @@ class Artifactory(Downloader):
 
 
 class Maven(Downloader):
-    URL_ROOT = "https://repository.apache.org" + \
-        "/content/repositories/staging/org/apache/arrow"
+    URL_ROOT = (
+        "https://repository.apache.org"
+        + "/content/repositories/staging/org/apache/arrow"
+    )
 
 
 class GitHub(Downloader):
@@ -174,8 +176,10 @@ class GitHub(Downloader):
         self._token = os.environ.get("GH_TOKEN")
 
     def get_file_list(self, prefix, filter=None):
-        url = (f"https://api.github.com/repos/{self._repository}/"
-               f"releases/tags/{self._tag}")
+        url = (
+            f"https://api.github.com/repos/{self._repository}/"
+            f"releases/tags/{self._tag}"
+        )
         print("Fetching release from", url)
         headers = {
             "Accept": "application/vnd.github+json",
@@ -204,6 +208,10 @@ class GitHub(Downloader):
             files.append((asset["name"], url))
         return files
 
+    def _filter_files(self, files, re_match):
+        regex = re.compile(re_match)
+        return [x for x in files if regex.match(x[0])]
+
     def _download_file(self, dest, asset):
         name, url = asset
 
@@ -226,11 +234,7 @@ class GitHub(Downloader):
         if self._curl_version() >= (7, 71, 0):
             # Also retry 403s
             extra_args.append("--retry-all-errors")
-        self._download_url(
-            url,
-            dest_path,
-            extra_args=extra_args
-        )
+        self._download_url(url, dest_path, extra_args=extra_args)
 
 
 def parallel_map_terminate_early(f, iterable, num_parallel):
@@ -248,38 +252,45 @@ def parallel_map_terminate_early(f, iterable, 
num_parallel):
 
 
 ARROW_REPOSITORY_PACKAGE_TYPES = [
-    'almalinux',
-    'amazon-linux',
-    'centos',
-    'debian',
-    'ubuntu',
+    "almalinux",
+    "amazon-linux",
+    "centos",
+    "debian",
+    "ubuntu",
 ]
-ARROW_STANDALONE_PACKAGE_TYPES = ['nuget', 'python']
-ARROW_PACKAGE_TYPES = \
-    ARROW_REPOSITORY_PACKAGE_TYPES + \
-    ARROW_STANDALONE_PACKAGE_TYPES
-
-
-def download_rc_binaries(version, rc_number, re_match=None, dest=None,
-                         num_parallel=None, target_package_type=None,
-                         repository=None, tag=None):
-    version_string = '{}-rc{}'.format(version, rc_number)
-    version_pattern = re.compile(r'\d+\.\d+\.\d+')
+ARROW_STANDALONE_PACKAGE_TYPES = ["nuget", "python"]
+ARROW_PACKAGE_TYPES = ARROW_REPOSITORY_PACKAGE_TYPES + ARROW_STANDALONE_PACKAGE_TYPES
+
+
+def download_rc_binaries(
+    version,
+    rc_number,
+    re_match=None,
+    dest=None,
+    num_parallel=None,
+    target_package_type=None,
+    repository=None,
+    tag=None,
+):
+    version_string = "{}-rc{}".format(version, rc_number)
+    version_pattern = re.compile(r"\d+\.\d+\.\d+")
     if target_package_type:
         package_types = [target_package_type]
     else:
         package_types = ARROW_PACKAGE_TYPES
     for package_type in package_types:
+
         def is_target(path):
             match = version_pattern.search(path)
             if not match:
                 return True
             return match[0] == version
+
         filter = is_target
 
-        if package_type == 'github' or package_type == 'nuget':
+        if package_type == "github" or package_type in ARROW_STANDALONE_PACKAGE_TYPES:
             downloader = GitHub(repository, tag)
-            prefix = ''
+            prefix = ""
             filter = None
         elif package_type in ARROW_REPOSITORY_PACKAGE_TYPES:
             downloader = Artifactory()
@@ -289,33 +300,56 @@ def download_rc_binaries(version, rc_number, 
re_match=None, dest=None,
             prefix = f'{package_type}-rc/{version_string}'
             filter = None
         files = downloader.get_file_list(prefix, filter=filter)
-        downloader.download_files(files, re_match=re_match, dest=dest,
-                                  num_parallel=num_parallel)
+        downloader.download_files(
+            files, re_match=re_match, dest=dest, num_parallel=num_parallel
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='Download release candidate binaries'
+        description="Download release candidate binaries")
+    parser.add_argument("version", type=str, help="The version number")
+    parser.add_argument(
+        "rc_number", type=int, help="The release candidate number, e.g. 0, 1, 
etc"
+    )
+    parser.add_argument(
+        "-e",
+        "--regexp",
+        type=str,
+        default=None,
+        help=(
+            "Regular expression to match on file names "
+            "to only download certain files"
+        ),
+    )
+    parser.add_argument(
+        "--dest",
+        type=str,
+        default=os.getcwd(),
+        help="The output folder for the downloaded files",
+    )
+    parser.add_argument(
+        "--num_parallel",
+        type=int,
+        default=DEFAULT_PARALLEL_DOWNLOADS,
+        help="The number of concurrent downloads to do",
+    )
+    parser.add_argument(
+        "--package_type",
+        type=str,
+        default=None,
+        help="The package type to be downloaded",
+    )
+    parser.add_argument(
+        "--repository",
+        type=str,
+        help=("The repository to pull from " "(required if 
--package_type=github)"),
+    )
+    parser.add_argument(
+        "--tag",
+        type=str,
+        help=("The release tag to download " "(required if 
--package_type=github)"),
     )
-    parser.add_argument('version', type=str, help='The version number')
-    parser.add_argument('rc_number', type=int,
-                        help='The release candidate number, e.g. 0, 1, etc')
-    parser.add_argument('-e', '--regexp', type=str, default=None,
-                        help=('Regular expression to match on file names '
-                              'to only download certain files'))
-    parser.add_argument('--dest', type=str, default=os.getcwd(),
-                        help='The output folder for the downloaded files')
-    parser.add_argument('--num_parallel', type=int,
-                        default=DEFAULT_PARALLEL_DOWNLOADS,
-                        help='The number of concurrent downloads to do')
-    parser.add_argument('--package_type', type=str, default=None,
-                        help='The package type to be downloaded')
-    parser.add_argument('--repository', type=str,
-                        help=('The repository to pull from '
-                              '(required if --package_type=github)'))
-    parser.add_argument('--tag', type=str,
-                        help=('The release tag to download '
-                              '(required if --package_type=github)'))
     args = parser.parse_args()
 
     download_rc_binaries(
diff --git a/dev/release/verify-release-candidate-wheels.bat 
b/dev/release/verify-release-candidate-wheels.bat
index a9a4703fae..e41d2dbf25 100644
--- a/dev/release/verify-release-candidate-wheels.bat
+++ b/dev/release/verify-release-candidate-wheels.bat
@@ -33,29 +33,18 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR%
 
 cd %_VERIFICATION_DIR%
 
-@rem clone Arrow repository to obtain test requirements
-set GIT_ENV_PATH=%_VERIFICATION_DIR%\_git
-call conda create -p %GIT_ENV_PATH% ^
-    --no-shortcuts -f -q -y git ^
-    || EXIT /B 1
-call activate %GIT_ENV_PATH%
-
-git clone https://github.com/apache/arrow.git || EXIT /B 1
-pushd arrow
-git submodule update --init
-popd
-
 set ARROW_VERSION=%1
 set RC_NUMBER=%2
 
-python arrow\dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^
-    --package_type python ^
+python dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^
+    --package_type="python" ^
+    --repository="apache/arrow" ^
+    --dest="%_VERIFICATION_DIR%" ^
+    --tag="apache-arrow-%ARROW_VERSION%-rc%RC_NUMBER%" ^
     --regex=".*win_amd64.*" || EXIT /B 1
 
-call deactivate
-
-set ARROW_TEST_DATA=%cd%\arrow\testing\data
-set PARQUET_TEST_DATA=%cd%\arrow\cpp\submodules\parquet-testing\data
+set ARROW_TEST_DATA=%cd%\testing\data
+set PARQUET_TEST_DATA=%cd%\cpp\submodules\parquet-testing\data
 
 
 CALL :verify_wheel 3.9
@@ -99,13 +88,13 @@ call activate %CONDA_ENV_PATH%
 
 set 
WHEEL_FILENAME=pyarrow-%ARROW_VERSION%-cp%PY_VERSION_NO_PERIOD%-cp%PY_VERSION_NO_PERIOD%%ABI_TAG%-win_amd64.whl
 
-pip install python-rc\%ARROW_VERSION%-rc%RC_NUMBER%\%WHEEL_FILENAME% || EXIT 
/B 1
+pip install %_VERIFICATION_DIR%\%WHEEL_FILENAME% || EXIT /B 1
 python -c "import pyarrow" || EXIT /B 1
 python -c "import pyarrow.parquet" || EXIT /B 1
 python -c "import pyarrow.flight" || EXIT /B 1
 python -c "import pyarrow.dataset" || EXIT /B 1
 
-pip install -r arrow\python\requirements-test.txt || EXIT /B 1
+pip install -r %_CURRENT_DIR%\python\requirements-test.txt || EXIT /B 1
 
 set PYARROW_TEST_CYTHON=OFF
 set TZDIR=%CONDA_ENV_PATH%\share\zoneinfo
@@ -113,6 +102,6 @@ pytest %CONDA_ENV_PATH%\Lib\site-packages\pyarrow --pdb -v 
|| EXIT /B 1
 
 :done
 
-call deactivate
+call conda deactivate
 
 EXIT /B 0
diff --git a/dev/release/verify-release-candidate.sh 
b/dev/release/verify-release-candidate.sh
index d7ffcdb0af..21afb90d93 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -168,6 +168,7 @@ verify_dir_artifact_signatures() {
 }
 
 test_binary() {
+  # this downloads all artifacts and verifies their checksums and signatures
   show_header "Testing binary artifacts"
   maybe_setup_conda
 
@@ -176,7 +177,8 @@ test_binary() {
 
   ${PYTHON:-python3} $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \
          --dest=${download_dir} \
-         --repository=${GITHUB_REPOSITORY:-apache/arrow}
+         --repository=${GITHUB_REPOSITORY:-apache/arrow} \
+         --tag="apache-arrow-$VERSION-rc$RC_NUMBER"
 
   verify_dir_artifact_signatures ${download_dir}
 }
@@ -1049,11 +1051,13 @@ test_wheels() {
       $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \
       --package_type python \
       --regex=${filter_regex} \
-      --dest=${download_dir}
+      --dest=${download_dir} \
+      --repository=${GITHUB_REPOSITORY:-apache/arrow} \
+      --tag="apache-arrow-$VERSION-rc$RC_NUMBER"
 
     verify_dir_artifact_signatures ${download_dir}
 
-    wheels_dir=${download_dir}/python-rc/${VERSION}-rc${RC_NUMBER}
+    wheels_dir=${download_dir}
   fi
 
   pushd ${wheels_dir}

Reply via email to