This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch change-free-type-to-choice
in repository https://gitbox.apache.org/repos/asf/airflow-site-archive.git
commit 07cc55aa17b03bc924ddfdd93d10ff4d805ada0d
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Fri May 9 19:55:17 2025 +0200

    Small improvements in github to s3 workflow
---
 .github/workflows/github-to-s3.yml | 23 +++++++++++++++++------
 scripts/github_to_s3.py            | 26 ++++++++++++++++----------
 scripts/transfer_utils.py          |  2 +-
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/github-to-s3.yml b/.github/workflows/github-to-s3.yml
index f92e69e300..f81fa56b07 100644
--- a/.github/workflows/github-to-s3.yml
+++ b/.github/workflows/github-to-s3.yml
@@ -33,16 +33,21 @@ on: # yamllint disable-line rule:truthy
       document-folder:
         description: "Provide any specific package document folder to sync"
         required: false
-        default: "NO_DOCS"
+        default: ""
         type: string
       sync-type:
         description: "Perform a full sync or just sync the last commit"
         required: false
-        default: "last_commit"
+        default: "single_commit"
         type: choice
         options:
-          - last_commit
+          - single_commit
           - full_sync
+      commit-sha:
+        description: "If specified, commit SHA used for single_commit (default is latest commit)"
+        required: false
+        default: ""
+        type: string
       processes:
         description: "Number of processes to use for syncing"
         required: false
@@ -87,13 +92,19 @@ jobs:
         run: |
           git clone --depth 2 https://github.com/apache/airflow-site-archive.git /mnt/airflow-site-archive

-      - name: "Syncing ${{ github.ref }} ( ${{ github.sha }} )"
+      - name: "Syncing ( ${{ inputs.commit-sha || github.sha }} ): ${{ inputs.sync-type }} ${{inputs.document-folder}}"
         env:
-          COMMIT_SHA: ${{ github.sha }}
+          COMMIT_SHA: ${{ inputs.commit-sha || github.sha }}
+          SYNC_TYPE: ${{ inputs.sync-type }}
         run: |
+          if [[ "${SYNC_TYPE}" == "single_commit" ]]; then
+            echo "Syncing ${COMMIT_SHA}"
+          else
+            echo "Syncing whole repo"
+          fi
           ls -la
           python3 -m pip install uv
           uv run ./scripts/github_to_s3.py --bucket-path ${{inputs.destination-location}} --local-path ${{inputs.local-path}} \
-            --document-folder ${{inputs.document-folder}} --commit-sha ${COMMIT_SHA} --sync-type ${{ inputs.sync-type }} \
+            --document-folder ${{ inputs.document-folder || 'NO_DOCS' }} --commit-sha ${COMMIT_SHA} --sync-type ${{ inputs.sync-type }} \
             --processes ${{ inputs.processes }}
         working-directory: /mnt/airflow-site-archive
diff --git a/scripts/github_to_s3.py b/scripts/github_to_s3.py
index 3f00f4d8cc..7760d74334 100644
--- a/scripts/github_to_s3.py
+++ b/scripts/github_to_s3.py
@@ -40,7 +40,7 @@ class GithubToS3(CommonTransferUtils):
         super().__init__(bucket, local_path)

     @staticmethod
-    def fetch_last_commit_files(commit_sha, diff_filter="ACM"):
+    def fetch_commit_files(commit_sha, diff_filter="ACM"):
         console.print(f"[blue] Fetching files from last commit {commit_sha} [/]")
         cmd = [
             "git",
@@ -61,7 +61,7 @@ class GithubToS3(CommonTransferUtils):
             return []
         return result.stdout.splitlines() if result.stdout else []

-    def sync_last_commit_files(self, commit_sha: str, processes: int):
+    def sync_single_commit_files(self, commit_sha: str, processes: int):
         '''
         There are two parts here.
         1. When any file gets removed under docs folder, we will remove from target location
@@ -70,8 +70,8 @@ class GithubToS3(CommonTransferUtils):

         # Fetching `d` excludes deleted files
         # Fetching `D` includes deleted files
-        files_cp_required = self.fetch_last_commit_files(commit_sha, diff_filter="d")
-        files_del_required = self.fetch_last_commit_files(commit_sha, diff_filter="D")
+        files_cp_required = self.fetch_commit_files(commit_sha, diff_filter="d")
+        files_del_required = self.fetch_commit_files(commit_sha, diff_filter="D")

         files_cp_required_under_docs = [f for f in files_cp_required if f.startswith("docs-archive/")]
         files_del_required_required_under_docs = [f for f in files_del_required if f.startswith("docs-archive/")]
@@ -102,15 +102,19 @@ class GithubToS3(CommonTransferUtils):
         self.run_with_pool(self.sync, pool_args, processes=processes)


+def convert_short_name_to_folder_name(short_name: str):
+    if not short_name.startswith("apache-airflow-providers-"):
+        return f"apache-airflow-providers-{short_name.replace('.', '-')}"
+    return short_name

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Sync GitHub to S3")
     parser.add_argument("--bucket-path", required=True, help="S3 bucket name with path")
     parser.add_argument("--local-path", required=True, help="local path to sync")
-    parser.add_argument("--document-folder", help="Document folder to sync", default="")
+    parser.add_argument("--document-folder", help="Document folder to sync (or short provider-id)", default="")
     parser.add_argument("--commit-sha", help="Commit SHA to sync", default="")
-    parser.add_argument("--sync-type", help="Sync type", default="last_commit")
+    parser.add_argument("--sync-type", help="Sync type", default="single_commit")
     parser.add_argument("--processes", help="Number of processes", type=int, default=8)

     args = parser.parse_args()
@@ -122,19 +126,21 @@ if __name__ == "__main__":

     if document_folder and document_folder != "NO_DOCS":
         full_local_path = Path(f"{args.local_path}/{document_folder}")
+        if not full_local_path.exists():
+            full_local_path = Path(f"{args.local_path}/{convert_short_name_to_folder_name(document_folder)}")
         if full_local_path.exists():
             console.print(f"[blue] Document folder {document_folder} exists in bucket {args.bucket_path}.[/]")
             destination = f"s3://{syncer.bucket_name}/{syncer.prefix}".rstrip("/") + "/" + document_folder
-            syncer.sync(source=full_local_path, destination=destination)
+            syncer.sync(source=full_local_path.as_posix(), destination=destination)
             sys.exit(0)
         else:
-            console.print(f"[red] Document folder {document_folder} does not exist in github {args.local_path}.[/]")
+            console.print(f"[red] Document folder {full_local_path} does not exist.[/]")
             sys.exit(1)

-    if args.sync_type == "last_commit" and args.commit_sha:
+    if args.sync_type == "single_commit" and args.commit_sha:
         console.print(f"[blue] Syncing last commit {args.commit_sha} from {args.local_path} [/]")
-        syncer.sync_last_commit_files(args.commit_sha, processes=int(args.processes))
+        syncer.sync_single_commit_files(args.commit_sha, processes=int(args.processes))
         sys.exit(0)

     if args.sync_type == "full_sync":
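
A note for readers on the renamed fetch_commit_files helper above: it relies on
git's --diff-filter semantics, where lowercase letters exclude a change type and
uppercase letters select only that type, so "d" yields the copy list and "D" the
delete list. The hunk truncates the script's actual cmd list, so the git
invocation in this minimal standalone sketch is an assumption about the wrapped
command, not a quote from the repository:

    # Editorial sketch, not part of the commit: list the files one commit
    # touched, the way fetch_commit_files() does. The exact git flags below
    # (diff-tree --no-commit-id --name-only -r) are assumed, since the hunk
    # above cuts off the real cmd list.
    import subprocess

    def changed_files(commit_sha: str, diff_filter: str) -> list[str]:
        # "d" -> everything except deletions (safe to copy to S3);
        # "D" -> only deleted paths (to remove from the S3 target).
        result = subprocess.run(
            ["git", "diff-tree", "--no-commit-id", "--name-only", "-r",
             f"--diff-filter={diff_filter}", commit_sha],
            capture_output=True, text=True, check=True,
        )
        return result.stdout.splitlines()

    to_copy = changed_files("HEAD", "d")    # added/changed files
    to_delete = changed_files("HEAD", "D")  # deleted files

Splitting a commit into a copy list and a delete list this way is what lets
sync_single_commit_files mirror deletions to the S3 target instead of only
uploading new files.
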
diff --git a/scripts/transfer_utils.py b/scripts/transfer_utils.py
index 132ff00bd2..bf73d0f6dd 100644
--- a/scripts/transfer_utils.py
+++ b/scripts/transfer_utils.py
@@ -63,7 +63,7 @@ class CommonTransferUtils:
             console.print(f"[yellow] Error: {e}[/]")
             return []

-    def sync(self, source, destination):
+    def sync(self, source: str, destination: str):
         console.print(f"[blue] Syncing {source} to {destination} [/]")