This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch consistency-changes-for-syncing-scripts
in repository https://gitbox.apache.org/repos/asf/airflow-site-archive.git
commit 320a3712dc51e21b5388a1adbd40a7b9a4c10ad6
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Thu May 15 22:44:25 2025 -0400

    Add consistency changes for the syncing scripts and workflows

    * folders -> packages (which is consistent with breeze naming)
    * more logical parameters for GH -> S3 sync (full sync boolean, better sequence of parameters)
    * commit-sha -> commit-ref (because it might not be only a SHA)
---
 .github/workflows/github-to-s3.yml | 114 +++++++++++++++++++------------------
 .github/workflows/s3-to-github.yml |  54 +++++++++---------
 README.md                          |  39 ++++---------
 scripts/github_to_s3.py            |  81 +++++++++++++-------------
 scripts/s3_to_github.py            |  36 ++++++------
 scripts/transfer_utils.py          |  42 +++++++-------
 6 files changed, 176 insertions(+), 190 deletions(-)

diff --git a/.github/workflows/github-to-s3.yml b/.github/workflows/github-to-s3.yml index e02de60a30..385248eb39 100644 --- a/.github/workflows/github-to-s3.yml +++ b/.github/workflows/github-to-s3.yml @@ -28,23 +28,20 @@ on: # yamllint disable-line rule:truthy - live - staging default: live - sync-type: - description: "Perform a full sync or just sync the last commit" + document-packages: + description: "Packages (long or short) separated with spaces" required: false - default: "single_commit" - type: choice - options: - - single_commit - - full_sync - commit-sha: - description: "For single-commit - SHA/tag/branch (default: latest in selected branch)" - required: false - default: "" + default: "all" type: string - document-folders: - description: "For full-sync, you can select which packages to upload - space separated" + full-sync: + description: "If set, the whole repo will be synced (not only a single commit)." required: false - default: "all" + default: false + type: boolean + commit-reference: + description: "Commit used to sync if full-sync is not set (default: HEAD of the chosen branch)."
+ required: false + default: "HEAD" type: string processes: description: "Number of processes to use for syncing" required: false default: "8" type: string @@ -59,24 +56,31 @@ jobs: - name: Summarize parameters id: parameters env: - DOCUMENT_FOLDERS: ${{ inputs.document-folders }} - SYNC_TYPE: ${{ inputs.sync-type }} - COMMIT_SHA: ${{ inputs.commit-sha }} + DOCUMENT_PACKAGES: ${{ inputs.document-packages }} PROCESSES: ${{ inputs.processes }} DESTINATION: ${{ inputs.destination }}} + COMMIT_REFERENCE: ${{ inputs.commit-reference }} + FULL_SYNC: ${{ inputs.full-sync }} run: | echo "Input parameters summary" echo "=========================" - echo "Document folders: ${DOCUMENT_FOLDER}" - echo "Sync type: ${SYNC_TYPE}" - echo "Commit SHA: ${COMMIT_SHA}" - echo "Processes: ${PROCESSES}" echo "Destination: ${DESTINATION}" + echo "Document packages: ${DOCUMENT_PACKAGES}" + echo "Full sync: ${FULL_SYNC}" + echo "Commit reference: ${COMMIT_REFERENCE}" + echo "Processes: ${PROCESSES}" + set -x if [[ "${DESTINATION}" == "live" ]]; then echo "destination-location=s3://live-docs-airflow-apache-org/docs/" >> ${GITHUB_OUTPUT} else echo "destination-location=s3://staging-docs-airflow-apache-org/docs/" >> ${GITHUB_OUTPUT} fi + if [[ "${FULL_SYNC}" == "true" ]] ; then + echo "sync-type=full-sync" >> ${GITHUB_OUTPUT} + else + echo "sync-type=single-commit" >> ${GITHUB_OUTPUT} + fi + echo "commit-ref=${{ inputs.full-sync && '' || env.COMMIT_REFERENCE }}" >> ${GITHUB_OUTPUT} - uses: actions/checkout@v4 # Checkout only workflow and scripts directory to run scripts from @@ -121,53 +125,53 @@ jobs: sudo chown -R "${USER}" /mnt/cloned-airflow-site-archive ln -v -s /mnt/cloned-airflow-site-archive ./cloned-airflow-site-archive - - name: Pre-process docs folders + - name: Pre-process docs packages env: - DOCUMENTS_FOLDERS: ${{ inputs.document-folders }} - id: docs-folders-processed + DOCUMENTS_PACKAGES: ${{ inputs.document-packages }} + id: docs-packages-processed run: | echo "sparse-checkout<<EOF" >> ${GITHUB_OUTPUT} - if [[ "${DOCUMENTS_FOLDERS}" != "all" ]]; then - echo "Preprocessing docs folders: ${DOCUMENTS_FOLDERS}" - folders="" + if [[ "${DOCUMENTS_PACKAGES}" != "all" ]]; then + echo "Preprocessing docs packages: ${DOCUMENTS_PACKAGES}" + packages="" sparse_checkout="" separator="" - for folder in ${DOCUMENTS_FOLDERS}; do - if [[ "${folder}" != apache-airflow-providers* ]]; then - folders="${folders}${separator}apache-airflow-providers-${folder/./-}" - echo "docs-archive/apache-airflow-providers-${folder/./-}" >> ${GITHUB_OUTPUT} + for package in ${DOCUMENTS_PACKAGES}; do + if [[ "${package}" != apache-airflow-providers* ]]; then + packages="${packages}${separator}apache-airflow-providers-${package/./-}" + echo "docs-archive/apache-airflow-providers-${package/./-}" >> ${GITHUB_OUTPUT} else - folders="${folders}${separator}${folder}" - echo "docs-archive/${folder}" >> ${GITHUB_OUTPUT} + packages="${packages}${separator}${package}" + echo "docs-archive/${package}" >> ${GITHUB_OUTPUT} fi separator=" " done else - folders="all" + packages="all" echo "docs-archive" >> ${GITHUB_OUTPUT} fi echo "EOF" >> ${GITHUB_OUTPUT} - echo "docs-folders-processed=${folders}" - echo "docs-folders-processed=${folders}" >> ${GITHUB_OUTPUT} + set -x + echo "docs-packages-processed=${packages}" >> ${GITHUB_OUTPUT} - name: > - Checkout (${{ inputs.commit-sha || github.sha }}) to /mnt/cloned-airflow-site-archive - with docs: ${{ steps.docs-folders-processed.outputs.docs-folders-processed }} + Checkout (${{ steps.parameters.outputs.commit-ref }}) to 
/mnt/cloned-airflow-site-archive + with docs: ${{ steps.docs-packages-processed.outputs.docs-packages-processed }} uses: actions/checkout@v4 with: path: ./cloned-airflow-site-archive fetch-depth: 2 sparse-checkout: | - ${{ steps.docs-folders-processed.outputs.sparse-checkout }} - ref: ${{ inputs.commit-sha || github.sha }} - if: steps.docs-folders-processed.outputs.docs-folders-processed != 'all' + ${{ steps.docs-packages-processed.outputs.sparse-checkout }} + ref: ${{ steps.parameters.outputs.commit-ref }} + if: steps.docs-packages-processed.outputs.docs-packages-processed != 'all' - name: > - Checkout (${{ inputs.commit-sha || github.sha }}) to /mnt/cloned-airflow-site-archive (whole repo) + Checkout (${{ steps.parameters.outputs.commit-ref }}) to /mnt/cloned-airflow-site-archive (whole repo) uses: actions/checkout@v4 with: path: ./cloned-airflow-site-archive fetch-depth: 2 - ref: ${{ inputs.commit-sha || github.sha }} - if: steps.docs-folders-processed.outputs.docs-folders-processed == 'all' + ref: ${{ steps.parameters.outputs.commit-ref }} + if: steps.docs-packages-processed.outputs.docs-packages-processed == 'all' - name: "Install uv" run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Add staging watermarks run: | chmod a+x add_watermark.py mkdir -p images curl -sSf -o images/staging.png https://raw.githubusercontent.com/apache/airflow-site/refs/heads/main/.github/scripts/images/staging.png - uv run add_watermark.py --pattern 'main.min.*.css' --folder docs-archive \ + uv run add_watermark.py --pattern 'main.min.*.css' --package docs-archive \ --image-directory images --url-prefix /docs/images if: inputs.destination == 'staging' - name: > - Syncing ${{ inputs.commit-sha || github.sha }}: ${{ inputs.destination }}: - ${{ inputs.sync-type }} ${{ steps.docs-folders-processed.outputs.docs-folders-processed }} - wih parallel aws cli methods = ${{ inputs.processes }} + Syncing ${{ steps.parameters.outputs.commit-ref }}: ${{ inputs.destination }}: + ${{ steps.parameters.outputs.sync-type }} ${{ steps.docs-packages-processed.outputs.docs-packages-processed }} + with parallel aws cli = ${{ inputs.processes }} env: - COMMIT_SHA: ${{ inputs.commit-sha || github.sha }} - SYNC_TYPE: ${{ inputs.sync-type }} - PROCESSES: ${{ inputs.processes }} - DOCUMENTS_FOLDERS: ${{ steps.docs-folders-processed.outputs.docs-folders-processed }} + COMMIT_REF: ${{ steps.parameters.outputs.commit-ref }} + SYNC_TYPE: ${{ steps.parameters.outputs.sync-type }} + DOCUMENTS_PACKAGES: ${{ steps.docs-packages-processed.outputs.docs-packages-processed }} DESTINATION_LOCATION: ${{ steps.parameters.outputs.destination-location }} + PROCESSES: ${{ inputs.processes }} run: | # show what's being run set -x if [[ "${SYNC_TYPE}" == "single_commit" ]]; then - echo "Syncing ${COMMIT_SHA}" + echo "Syncing ${COMMIT_REF}" else echo "Syncing whole repo" fi @@ -205,6 +209,6 @@ jobs: uv run ./scripts/github_to_s3.py \ --bucket-path ${DESTINATION_LOCATION} \ --local-path /mnt/cloned-airflow-site-archive/docs-archive \ - --document-folders "${DOCUMENTS_FOLDERS}" \ - --commit-sha ${COMMIT_SHA} --sync-type ${SYNC_TYPE} \ + --document-packages "${DOCUMENTS_PACKAGES}" \ + --commit-ref ${COMMIT_REF} --sync-type ${SYNC_TYPE} \ --processes ${PROCESSES} diff --git a/.github/workflows/s3-to-github.yml b/.github/workflows/s3-to-github.yml index d4650ff7cf..db3c8f6017 100644 --- a/.github/workflows/s3-to-github.yml +++ b/.github/workflows/s3-to-github.yml @@ -28,8 +28,8 @@ on: # yamllint disable-line rule:truthy - live - 
staging default: live - document-folders: - description: "Document folders to sync or short package ids (separated with spaces)" + document-packages: + description: "Document packages - long or short ids - separated by spaces" required: false default: "all" type: string @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest env: SOURCE: ${{ inputs.source }} - DOCUMENT_FOLDERS: ${{ inputs.document-folders }} + DOCUMENT_PACKAGES: ${{ inputs.document-packages }} COMMIT_CHANGES: ${{ inputs.commit-changes }} PROCESSES: ${{ inputs.processes }} steps: @@ -59,7 +59,7 @@ echo "Input parameters summary" echo "=========================" echo "Source: ${SOURCE}" - echo "Document folders: ${DOCUMENT_FOLDERS}" + echo "Document packages: ${DOCUMENT_PACKAGES}" echo "Commit changes: ${COMMIT_CHANGES}" echo "Processes: ${PROCESSES}" if [[ "${SOURCE}" == "live" ]]; then @@ -86,37 +86,37 @@ aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - - name: Pre-process docs folders + - name: Pre-process docs packages env: - DOCUMENTS_FOLDERS: ${{ inputs.document-folders }} - id: docs-folders-processed + DOCUMENT_PACKAGES: ${{ inputs.document-packages }} + id: docs-packages-processed run: | echo "sparse-checkout<<EOF" >> ${GITHUB_OUTPUT} echo ".github" >> ${GITHUB_OUTPUT} echo "scripts" >> ${GITHUB_OUTPUT} - if [[ "${DOCUMENTS_FOLDERS}" != "all" ]]; then - echo "Preprocessing docs folders: ${DOCUMENTS_FOLDERS}" - folders="" + if [[ "${DOCUMENT_PACKAGES}" != "all" ]]; then + echo "Preprocessing docs packages: ${DOCUMENT_PACKAGES}" + packages="" sparse_checkout="" separator="" - for folder in ${DOCUMENTS_FOLDERS}; do - if [[ "${folder}" != apache-airflow-providers* && "${folder}" != "apache-airflow" \ - && "${folder}" != "docker-stack" && "${folder}" != "helm-chart" ]]; then - folders="${folders}${separator}apache-airflow-providers-${folder/./-}" - echo "docs-archive/apache-airflow-providers-${folder/./-}" >> ${GITHUB_OUTPUT} + for package in ${DOCUMENT_PACKAGES}; do + if [[ "${package}" != apache-airflow-providers* && "${package}" != "apache-airflow" \ + && "${package}" != "docker-stack" && "${package}" != "helm-chart" ]]; then + packages="${packages}${separator}apache-airflow-providers-${package/./-}" + echo "docs-archive/apache-airflow-providers-${package/./-}" >> ${GITHUB_OUTPUT} else - folders="${folders}${separator}${folder}" - echo "docs-archive/${folder}" >> ${GITHUB_OUTPUT} + packages="${packages}${separator}${package}" + echo "docs-archive/${package}" >> ${GITHUB_OUTPUT} fi separator=" " done else - folders="all" + packages="all" echo "docs-archive" >> ${GITHUB_OUTPUT} fi echo "EOF" >> ${GITHUB_OUTPUT} - echo "docs-folders-processed=${folders}" - echo "docs-folders-processed=${folders}" >> ${GITHUB_OUTPUT} + echo "docs-packages-processed=${packages}" + echo "docs-packages-processed=${packages}" >> ${GITHUB_OUTPUT} - name: Create /mnt/cloned-airflow-site-archive directory run: | @@ -125,14 +125,14 @@ - name: > Checkout (${{ inputs.commit-sha || github.sha }}) to /mnt/cloned-airflow-site-archive - with docs: ${{ steps.docs-folders-processed.outputs.docs-folders-processed }} + with docs: ${{ steps.docs-packages-processed.outputs.docs-packages-processed }} uses: actions/checkout@v4 with: path: ./cloned-airflow-site-archive fetch-depth: 1 sparse-checkout: | - ${{ steps.docs-folders-processed.outputs.sparse-checkout }} - if: steps.docs-folders-processed.outputs.docs-folders-processed != 'all' + ${{ steps.docs-packages-processed.outputs.sparse-checkout }} + if: 
steps.docs-packages-processed.outputs.docs-packages-processed != 'all' - name: > Checkout (${{ inputs.commit-sha || github.sha }}) to /mnt/cloned-airflow-site-archive (whole repo) @@ -140,15 +140,15 @@ with: path: ./cloned-airflow-site-archive fetch-depth: 1 - if: steps.docs-folders-processed.outputs.docs-folders-processed == 'all' + if: steps.docs-packages-processed.outputs.docs-packages-processed == 'all' - name: "Check space available" run: df -h - - name: Syncing ${{ inputs.source }} (${{ inputs.document-folders }}) + - name: Syncing ${{ inputs.source }} (${{ inputs.document-packages }}) env: PROCESSES: ${{ inputs.processes }} - DOCUMENTS_FOLDERS: ${{ inputs.document-folders }} + DOCUMENT_PACKAGES: ${{ inputs.document-packages }} SOURCE_LOCATION: ${{ steps.parameters.outputs.source-location }} run: | set -x @@ -156,7 +156,7 @@ uv run ./scripts/s3_to_github.py \ --bucket-path ${SOURCE_LOCATION} \ --local-path ./docs-archive \ - --document-folders "${DOCUMENT_FOLDERS}" \ + --document-packages "${DOCUMENT_PACKAGES}" \ --processes "${PROCESSES}" working-directory: /mnt/cloned-airflow-site-archive diff --git a/README.md b/README.md index 25526b0a26..3f134b8bd3 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,13 @@ -# airflow-site-archive +# Airflow sync archive -### Documentation Syncing Process -### S3 To GitHub -**Sync S3 to Github**: Use the `scripts/s3_to_github.py` script to download the latest documentation from S3 to your ./docs-archive folder. -It has the following command line arguments: -- `--bucket-path`: The S3 bucket path where the documentation is stored. -- `--local-path`: The local path where the documentation will be downloaded. -- `--document-folder`: The folder in the S3 bucket where the documentation is stored (This is optional if any particular - folder need to be synced, provide the folder name ex: `apache-airflow-providers-amazon`). -```bash -uv run ./scripts/s3_to_github.py --bucket-path s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive -``` +The repository stores the archive of generated documentation from Apache Airflow. +The scripts and workflows here make it possible to keep the repository in sync with the S3 buckets - both live and +staging - where the documentation is stored. Syncing in both directions is possible. -### GitHub To S3 -**Sync Github to S3**: Use the `scripts/github_to_s3.py` script to upload the latest documentation from your ./docs-archive folder to S3. -It has two modes: -1. **Last commit**: Syncs only last commit changes to S3. -2. **Full sync**: Syncs all files under `./docs-archive` to S3. -It has the following command line arguments: - -- `--bucket-path`: The S3 bucket path where the documentation will be stored. -- `--local-path`: The local path where the documentation is stored. -- `--document-folder`: The folder in the local path where the documentation is stored (This is optional if any particular - folder need to be synced, provide the folder name ex: `apache-airflow-providers-amazon`). -- `--sync-type`: The type of sync to perform. Can be either `last_commit` or `full_sync`. -- `--commit-sha`: The commit sha to sync to S3. This is only required if the sync type is `last_commit`. 
- -```bash -uv run ./scripts/github_to_s3.py --bucket-path s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive --sync-type last-commit``` +In the future we will automate synchronization of the repository after any change to the buckets. Currently, +manual synchronization S3 -> GitHub for the `live` bucket documentation is done using the `S3 to GitHub` workflow +that subsequently uses the `s3_to_github.py` script, and syncing the repository to the `staging` bucket is done +using the `GitHub to S3` workflow that uses the `github_to_s3.py` script. The scripts can also be used to +perform manual syncs of changes when we modify the documentation in the repository and want to +sync it to either of the S3 buckets. diff --git a/scripts/github_to_s3.py b/scripts/github_to_s3.py index 4b78226d54..e9c51f0efa 100644 --- a/scripts/github_to_s3.py +++ b/scripts/github_to_s3.py @@ -31,7 +31,7 @@ from pathlib import Path from rich.console import Console -from transfer_utils import CommonTransferUtils, convert_short_name_to_folder_name, sort_priority_folders +from transfer_utils import CommonTransferUtils, convert_short_name_to_full_package_name, sort_priority_packages console = Console(width=200, color_system="standard") @@ -40,16 +40,16 @@ class GithubToS3(CommonTransferUtils): super().__init__(bucket, local_path) @staticmethod - def fetch_commit_files(commit_sha: str, diff_filter: str="ACM"): - console.print(f"[blue] Fetching files from last commit {commit_sha} [/]") + def fetch_commit_files(commit_ref: str, diff_filter: str="ACM"): + console.print(f"[blue] Fetching files from last commit {commit_ref} [/]") cmd = [ "git", "diff-tree", "--no-commit-id", "--name-only", "-r", - commit_sha + "^", - commit_sha, + commit_ref + "^", + commit_ref, f"--diff-filter={diff_filter}" ] result = subprocess.run(cmd, check=False, capture_output=True, text=True) @@ -61,17 +61,17 @@ class GithubToS3(CommonTransferUtils): sys.exit(1) return result.stdout.splitlines() if result.stdout else [] - def sync_single_commit_files(self, commit_sha: str, processes: int): + def sync_single_commit_files(self, commit_ref: str, processes: int): ''' There are two parts here. - 1. When any file gets removed under docs folder, we will remove from target location - 2. When any file gets added/changed/modified under docs folder, we will copy from source to target location + 1. When any file gets removed under docs package, we will remove from target location + 2. 
When any file gets added/changed/modified under docs package, we will copy from source to target location ''' # Fetching `d` excludes deleted files # Fetching `D` includes deleted files - files_cp_required = self.fetch_commit_files(commit_sha, diff_filter="d") - files_del_required = self.fetch_commit_files(commit_sha, diff_filter="D") + files_cp_required = self.fetch_commit_files(commit_ref, diff_filter="d") + files_del_required = self.fetch_commit_files(commit_ref, diff_filter="D") files_cp_required_under_docs = [f for f in files_cp_required if f.startswith("docs-archive/")] files_del_required_required_under_docs = [f for f in files_del_required if f.startswith("docs-archive/")] @@ -91,16 +91,16 @@ class GithubToS3(CommonTransferUtils): self.run_with_pool(self.remove, delete_files_pool_args, processes=processes) self.run_with_pool(self.copy, copy_files_pool_args, processes=processes) - def full_sync(self, processes: int, folders: list[str] | None = None): - if folders: - console.print(f"[blue] Syncing folders {folders} from {self.local_path} to {self.bucket_name} [/]") + def full_sync(self, processes: int, packages: list[str] | None = None): + if packages: + console.print(f"[blue] Syncing packages {packages} from {self.local_path} to {self.bucket_name} [/]") else: console.print(f"[blue] Syncing all files from {self.local_path} to {self.bucket_name} [/]") - list_of_folders = os.listdir(self.local_path) if not folders else folders + list_of_packages = os.listdir(self.local_path) if not packages else packages pool_args = [] - for folder in sort_priority_folders(list_of_folders): - source = os.path.join(self.local_path, folder) - dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/" + folder + for package in sort_priority_packages(list_of_packages): + source = os.path.join(self.local_path, package) + dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/" + package pool_args.append((source, dest)) self.run_with_pool(self.sync, pool_args, processes=processes) @@ -111,11 +111,12 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Sync GitHub to S3") parser.add_argument("--bucket-path", required=True, help="S3 bucket name with path") parser.add_argument("--local-path", required=True, help="local path to sync") - parser.add_argument("--document-folders", help="Document folders to sync " + parser.add_argument("--document-packages", help="Document packages to sync " "(or short provider-ids) separated with spaces " - "('all' means all folders)", default="") - parser.add_argument("--commit-sha", help="Commit SHA to sync", default="") - parser.add_argument("--sync-type", help="Sync type", default="single_commit") + "('all' means all packages)", default="") + parser.add_argument("--commit-ref", help="Commit ref to sync (sha/HEAD/branch)", default="") + parser.add_argument("--sync-type", help="Sync type", choices=('full-sync', 'single-commit'), + default="single-commit") parser.add_argument("--processes", help="Number of processes", type=int, default=8) args = parser.parse_args() @@ -123,36 +124,36 @@ if __name__ == "__main__": syncer = GithubToS3(bucket=args.bucket_path, local_path=args.local_path) syncer.check_bucket() - document_folders = args.document_folders + document_packages = args.document_packages # Make sure you are in the right directory for git commands os.chdir(Path(args.local_path).parent.as_posix()) # Force color os.environ["FORCE_COLOR"] = "1" - if document_folders != "all" and args.sync_type == "single_commit": - console.print(f"[red] 
Invalid folder name {document_folders} for sync type {args.sync_type} - only " - f"all can be used with single_commit[/]") + if document_packages != "all" and args.sync_type == "single-commit": + console.print(f"[red] Invalid package name {document_packages} for sync type {args.sync_type} - only " + f"all can be used with single-commit[/]") sys.exit(1) - if document_folders and document_folders != "all" and args.sync_type == "full_sync": - folders_to_sync = [] - for _folder in document_folders.split(" "): - full_local_path = Path(f"{args.local_path}/{_folder}") + if document_packages and document_packages != "all" and args.sync_type == "full-sync": + packages_to_sync = [] + for _package in document_packages.split(" "): + full_local_path = Path(f"{args.local_path}/{_package}") if not full_local_path.exists(): - full_local_path = Path(f"{args.local_path}/{convert_short_name_to_folder_name(_folder)}") + full_local_path = Path(f"{args.local_path}/{convert_short_name_to_full_package_name(_package)}") if full_local_path.exists(): - console.print(f"[blue] Document folder {_folder} exists in bucket {args.bucket_path}.[/]") - folders_to_sync.append(_folder) + console.print(f"[blue] Document package {_package} exists in bucket {args.bucket_path}.[/]") + packages_to_sync.append(_package) else: - console.print(f"[red] Document folder {full_local_path} does not exist.[/]") + console.print(f"[red] Document package {full_local_path} does not exist.[/]") sys.exit(1) - syncer.full_sync(processes=int(args.processes), folders=folders_to_sync) - elif args.sync_type == "full_sync": + syncer.full_sync(processes=int(args.processes), packages=packages_to_sync) + elif args.sync_type == "full-sync": syncer.full_sync(processes=int(args.processes)) - elif args.sync_type == "single_commit" and args.commit_sha and document_folders == "all": - console.print(f"[blue] Syncing last commit {args.commit_sha} from {args.local_path} [/]") - syncer.sync_single_commit_files(args.commit_sha, processes=int(args.processes)) + elif args.sync_type == "single-commit" and args.commit_ref and document_packages == "all": + console.print(f"[blue] Syncing last commit {args.commit_ref} from {args.local_path} [/]") + syncer.sync_single_commit_files(args.commit_ref, processes=int(args.processes)) else: - console.print(f"[red] Invalid sync type {args.sync_type} with document folders {document_folders} " - f"and commit sha {args.commit_sha}[/]") + console.print(f"[red] Invalid sync type {args.sync_type} with document packages {document_packages} " + f"and commit ref {args.commit_ref}[/]") sys.exit(1) diff --git a/scripts/s3_to_github.py b/scripts/s3_to_github.py index 0eac7ffb10..f780ffd7b4 100644 --- a/scripts/s3_to_github.py +++ b/scripts/s3_to_github.py @@ -29,7 +29,7 @@ import sys from rich.console import Console -from transfer_utils import CommonTransferUtils, convert_short_name_to_folder_name, sort_priority_folders, \ +from transfer_utils import CommonTransferUtils, convert_short_name_to_full_package_name, sort_priority_packages, \ sort_priority_tuples console = Console(width=200, color_system="standard") @@ -40,18 +40,18 @@ class S3TOGithub(CommonTransferUtils): def __init__(self, bucket:str , local_path: str): super().__init__(bucket, local_path) - def verify_document_folder(self, document_folder: str): + def verify_document_package(self, document_package: str): response= self.s3_client.list_objects_v2( Bucket=self.bucket_name, - Prefix=self.prefix.rstrip("/") + "/" + document_folder, + Prefix=self.prefix.rstrip("/") + "/" + 
document_package, ) return response["KeyCount"] > 0 - def sync_s3_to_github(self, processes: int, folders: list[str] | None = None, + def sync_s3_to_github(self, processes: int, packages: list[str] | None = None, remote_prefix: str = "docs/"): console.print("[blue] Syncing files from S3 to GitHub...[/]") - prefixes = self.get_list_of_folders() if not folders else [ - f"{remote_prefix}{folder}" for folder in folders + prefixes = self.get_list_of_packages() if not packages else [ + f"{remote_prefix}{package}" for package in packages ] pool_args = [] for pref in prefixes: @@ -70,31 +70,31 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Sync S3 to GitHub") parser.add_argument("--bucket-path", required=True, help="S3 bucket name with path") parser.add_argument("--local-path", required=True, help="local path to sync") - parser.add_argument("--document-folders", help="Document folders to sync", default="all") + parser.add_argument("--document-packages", help="Document packages to sync", default="all") parser.add_argument("--processes", help="Number of processes", type=int, default=8) args = parser.parse_args() syncer = S3TOGithub(bucket=args.bucket_path, local_path=args.local_path) syncer.check_bucket() - _document_folders = args.document_folders + _document_packages = args.document_packages # Make sure you are in the right directory for git commands os.chdir(Path(args.local_path).parent.as_posix()) # Force color os.environ["FORCE_COLOR"] = "1" - if _document_folders and _document_folders != "all": - folders_to_sync = [] - for _folder in _document_folders.split(" "): - full_folder_name = convert_short_name_to_folder_name(_folder) - if syncer.verify_document_folder(full_folder_name): - console.print(f"[blue] Document folder {full_folder_name} exists in bucket {args.bucket_path}.[/]") - folders_to_sync.append(full_folder_name) + if _document_packages and _document_packages != "all": + packages_to_sync = [] + for _package in _document_packages.split(" "): + full_package_name = convert_short_name_to_full_package_name(_package) + if syncer.verify_document_package(full_package_name): + console.print(f"[blue] Document package {full_package_name} exists in bucket {args.bucket_path}.[/]") + packages_to_sync.append(full_package_name) else: - console.print(f"[red] Document folder {full_folder_name} does not exist in bucket {args.bucket_path}.[/]") + console.print(f"[red] Document package {full_package_name} does not exist in bucket {args.bucket_path}.[/]") sys.exit(1) - folders_to_sync = sort_priority_folders(folders_to_sync) - syncer.sync_s3_to_github(processes=int(args.processes), folders=folders_to_sync) + packages_to_sync = sort_priority_packages(packages_to_sync) + syncer.sync_s3_to_github(processes=int(args.processes), packages=packages_to_sync) else: syncer.sync_s3_to_github(processes=int(args.processes)) diff --git a/scripts/transfer_utils.py b/scripts/transfer_utils.py index 4692c8cdc8..6dd480bcdf 100644 --- a/scripts/transfer_utils.py +++ b/scripts/transfer_utils.py @@ -16,13 +16,13 @@ KNOWN_PACKAGES = ["apache-airflow", "helm-chart", "docker-stack"] console = Console(width=200, color_system="standard") -def track_progress(folder: str, file_path: Path): +def track_progress(package: str, file_path: Path): while file_path.exists(): sleep(10) if not file_path.exists(): break num_lines = file_path.read_text().count("\n") - console.print(f"{folder}:[blue] Processed {num_lines} files[/]") + console.print(f"{package}:[blue] Processed {num_lines} files[/]") class 
CommonTransferUtils: @@ -61,8 +61,8 @@ class CommonTransferUtils: console.print(f"[red] Error: {e}[/]") sys.exit(1) - def get_list_of_folders(self) -> list[str]: - folders = [] + def get_list_of_packages(self) -> list[str]: + packages = [] try: response = self.s3_client.list_objects_v2( Bucket=self.bucket_name, @@ -71,8 +71,8 @@ class CommonTransferUtils: ) if 'CommonPrefixes' in response: for cur_prefix in response['CommonPrefixes']: - folders.append(cur_prefix['Prefix']) - return sorted(folders) + packages.append(cur_prefix['Prefix']) + return sorted(packages) except Exception as e: console.print(f"[yellow] Error: {e}[/]") return [] @@ -104,7 +104,7 @@ class CommonTransferUtils: @staticmethod def run_with_pool(func: Callable, args: Any, processes: int = 4): # Chunksize is set to 1 - otherwise map / starmap will send tasks in chunks - # and the prioritization we set for folders will be lost. + # and the prioritization we set for packages will be lost. # Our tasks are big enough to not cause overhead of sending # them one at a time. with Pool(processes=processes) as pool: @@ -136,33 +136,33 @@ class CommonTransferUtils: ) console.print(f"{file_to_delete}[green] Delete completed[/]") -def convert_short_name_to_folder_name(short_name: str) -> str: +def convert_short_name_to_full_package_name(short_name: str) -> str: if not short_name.startswith("apache-airflow-providers-") and short_name not in KNOWN_PACKAGES: return f"apache-airflow-providers-{short_name.replace('.', '-')}" return short_name -# start with those folders first -PRIORITY_FOLDERS = ["apache-airflow-providers-google", "apache-airflow-providers-amazon", "apache-airflow"] +# start with those packages first +PRIORITY_PACKAGES = ["apache-airflow-providers-google", "apache-airflow-providers-amazon", "apache-airflow"] -def sort_priority_folders(folders: list[str]) -> list[str]: +def sort_priority_packages(packages: list[str]) -> list[str]: """ - Sort the folders in a way that the priority folders are at the top + Sort the packages in a way that the priority packages are at the top """ - sorted_folders = [] - for folder in PRIORITY_FOLDERS: - if folder in folders: - sorted_folders.append(folder) - folders.remove(folder) - return sorted_folders + sorted(folders) + sorted_packages = [] + for package in PRIORITY_PACKAGES: + if package in packages: + sorted_packages.append(package) + packages.remove(package) + return sorted_packages + sorted(packages) def sort_priority_tuples(tuples: list[tuple[str, str]]) -> list[tuple[str, str]]: """ - Sort the tuples in a way that the priority folders are at the top + Sort the tuples in a way that the priority packages are at the top """ sorted_tuples = [] - for folder in PRIORITY_FOLDERS: + for package in PRIORITY_PACKAGES: for tup in tuples: - if tup[0].endswith(folder +"/"): + if tup[0].endswith(package +"/"): sorted_tuples.append(tup) tuples.remove(tup) return sorted_tuples + sorted(tuples, key=lambda x: x[0])
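
For reference, a minimal sketch of how the renamed options might be invoked locally after this change, based on the argparse definitions in the diff above. The bucket paths are the ones used by the workflows; the package ids and commit ref are illustrative placeholders.

```bash
# Single-commit sync to the staging bucket: github_to_s3.py expects
# --document-packages "all" when --sync-type is single-commit.
uv run ./scripts/github_to_s3.py \
    --bucket-path s3://staging-docs-airflow-apache-org/docs/ \
    --local-path ./docs-archive \
    --document-packages "all" \
    --commit-ref HEAD --sync-type single-commit \
    --processes 8

# Full sync of selected packages (short provider ids are expanded to
# apache-airflow-providers-* names by the script).
uv run ./scripts/github_to_s3.py \
    --bucket-path s3://staging-docs-airflow-apache-org/docs/ \
    --local-path ./docs-archive \
    --document-packages "amazon google" \
    --sync-type full-sync

# Pull selected packages from the live bucket back into the local archive.
uv run ./scripts/s3_to_github.py \
    --bucket-path s3://live-docs-airflow-apache-org/docs/ \
    --local-path ./docs-archive \
    --document-packages "amazon" \
    --processes 8
```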