This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch consistency-changes-for-syncing-scripts
in repository https://gitbox.apache.org/repos/asf/airflow-site-archive.git

commit 320a3712dc51e21b5388a1adbd40a7b9a4c10ad6
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Thu May 15 22:44:25 2025 -0400

    Add consistency changes for the syncing scripts and workflows
    
    * folders -> packages (which is consistent with breeze naming)
    * more logical parameters for GH -> S3 sync (full sync boolean,
      better sequence of parameters)
    * commit-sha -> commit-ref (because it might not be only a SHA - it can also be a tag or branch)
---
 .github/workflows/github-to-s3.yml | 114 +++++++++++++++++++------------------
 .github/workflows/s3-to-github.yml |  54 +++++++++---------
 README.md                          |  39 ++++---------
 scripts/github_to_s3.py            |  81 +++++++++++++-------------
 scripts/s3_to_github.py            |  36 ++++++------
 scripts/transfer_utils.py          |  42 +++++++-------
 6 files changed, 176 insertions(+), 190 deletions(-)
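
With the renamed inputs, a GitHub -> S3 run can be dispatched from the command line roughly as
below (a sketch assuming the `gh` CLI and that the workflow is available on the ref being
dispatched; all values are examples):

```bash
# Example dispatch of the GitHub -> S3 workflow using the renamed inputs
gh workflow run github-to-s3.yml -R apache/airflow-site-archive \
  -f destination=staging \
  -f document-packages="all" \
  -f full-sync=false \
  -f commit-reference=HEAD \
  -f processes=8
```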

diff --git a/.github/workflows/github-to-s3.yml 
b/.github/workflows/github-to-s3.yml
index e02de60a30..385248eb39 100644
--- a/.github/workflows/github-to-s3.yml
+++ b/.github/workflows/github-to-s3.yml
@@ -28,23 +28,20 @@ on: # yamllint disable-line rule:truthy
           - live
           - staging
         default: live
-      sync-type:
-        description: "Perform a full sync or just sync the last commit"
+      document-packages:
+        description: "Packages (long or short) separated with spaces"
         required: false
-        default: "single_commit"
-        type: choice
-        options:
-          - single_commit
-          - full_sync
-      commit-sha:
-        description: "For single-commit - SHA/tag/branch (default: latest in 
selected branch)"
-        required: false
-        default: ""
+        default: "all"
         type: string
-      document-folders:
-        description: "For full-sync, you can select which packages to upload - 
space separated"
+      full-sync:
+        description: "If set, the whole repo will be synced (not just a single commit)."
         required: false
-        default: "all"
+        default: false
+        type: boolean
+      commit-reference:
+        description: "Commit used to sync when not a full sync (defaults to HEAD of the chosen branch)."
+        required: false
+        default: "HEAD"
         type: string
       processes:
         description: "Number of processes to use for syncing"
@@ -59,24 +56,31 @@ jobs:
       - name: Summarize parameters
         id: parameters
         env:
-          DOCUMENT_FOLDERS: ${{ inputs.document-folders }}
-          SYNC_TYPE: ${{ inputs.sync-type }}
-          COMMIT_SHA: ${{ inputs.commit-sha }}
+          DOCUMENT_PACKAGES: ${{ inputs.document-packages }}
           PROCESSES: ${{ inputs.processes }}
-          DESTINATION: ${{ inputs.destination }}}
+          DESTINATION: ${{ inputs.destination }}
+          COMMIT_REFERENCE: ${{ inputs.commit-reference }}
+          FULL_SYNC: ${{ inputs.full-sync }}
         run: |
           echo "Input parameters summary"
           echo "========================="
-          echo "Document folders: ${DOCUMENT_FOLDER}"
-          echo "Sync type: ${SYNC_TYPE}"
-          echo "Commit SHA: ${COMMIT_SHA}"
-          echo "Processes: ${PROCESSES}"
           echo "Destination: ${DESTINATION}"
+          echo "Document packages: ${DOCUMENT_PACKAGES}"
+          echo "Full sync: ${FULL_SYNC}"
+          echo "Commit reference: ${COMMIT_REFERENCE}"
+          echo "Processes: ${PROCESSES}"
+          set -x
           if [[ "${DESTINATION}"  == "live" ]]; then
              echo 
"destination-location=s3://live-docs-airflow-apache-org/docs/" >> 
${GITHUB_OUTPUT}
           else
              echo 
"destination-location=s3://staging-docs-airflow-apache-org/docs/" >> 
${GITHUB_OUTPUT}
           fi
+          if [[ "${FULL_SYNC}" == "true" ]] ; then
+            echo "sync-type=full-sync" >> ${GITHUB_OUTPUT}
+          else
+            echo "sync-type=single-commit" >> ${GITHUB_OUTPUT}
+          fi
+          echo "commit-ref=${{ input.full-sync && '' || env.COMMIT_REFERENCE 
}}" >> ${GITHUB_OUTPUT}
 
       - uses: actions/checkout@v4
         # Checkout only workflow and scripts directory to run scripts from
@@ -121,53 +125,53 @@ jobs:
           sudo chown -R "${USER}" /mnt/cloned-airflow-site-archive
           ln -v -s /mnt/cloned-airflow-site-archive 
./cloned-airflow-site-archive
 
-      - name: Pre-process docs folders
+      - name: Pre-process docs packages
         env:
-          DOCUMENTS_FOLDERS: ${{ inputs.document-folders }}
-        id: docs-folders-processed
+          DOCUMENTS_PACKAGES: ${{ inputs.document-packages }}
+        id: docs-packages-processed
         run: |
           echo "sparse-checkout<<EOF" >> ${GITHUB_OUTPUT}
-          if [[ "${DOCUMENTS_FOLDERS}" != "all" ]]; then
-            echo "Preprocessing docs folders: ${DOCUMENTS_FOLDERS}"
-            folders=""
+          if [[ "${DOCUMENTS_PACKAGES}" != "all" ]]; then
+            echo "Preprocessing docs packages: ${DOCUMENTS_PACKAGES}"
+            packages=""
             sparse_checkout=""
             separator=""
-            for folder in ${DOCUMENTS_FOLDERS}; do
-              if [[ "${folder}" != apache-airflow-providers* ]]; then
-                
folders="${folders}${separator}apache-airflow-providers-${folder/./-}"
-                echo "docs-archive/apache-airflow-providers-${folder/./-}" >> 
${GITHUB_OUTPUT}
+            for package in ${DOCUMENTS_PACKAGES}; do
+              if [[ "${package}" != apache-airflow-providers* ]]; then
+                
packages="${packages}${separator}apache-airflow-providers-${package/./-}"
+                echo "docs-archive/apache-airflow-providers-${package/./-}" >> 
${GITHUB_OUTPUT}
               else
-                folders="${folders}${separator}${folder}"
-                echo "docs-archive/${folder}" >> ${GITHUB_OUTPUT}
+                packages="${packages}${separator}${package}"
+                echo "docs-archive/${package}" >> ${GITHUB_OUTPUT}
               fi
               separator=" "
             done
           else
-            folders="all"
+            packages="all"
             echo "docs-archive" >> ${GITHUB_OUTPUT}
           fi
           echo "EOF" >> ${GITHUB_OUTPUT}
-          echo "docs-folders-processed=${folders}"
-          echo "docs-folders-processed=${folders}" >> ${GITHUB_OUTPUT}
+          set -x
+          echo "docs-packages-processed=${packages}" >> ${GITHUB_OUTPUT}
       - name: >
-          Checkout (${{  inputs.commit-sha || github.sha }}) to 
/mnt/cloned-airflow-site-archive
-          with docs: ${{ 
steps.docs-folders-processed.outputs.docs-folders-processed }}
+          Checkout (${{  steps.parameters.outputs.commit-ref }}) to 
/mnt/cloned-airflow-site-archive
+          with docs: ${{ 
steps.docs-packages-processed.outputs.docs-packages-processed }}
         uses: actions/checkout@v4
         with:
           path: ./cloned-airflow-site-archive
           fetch-depth: 2
           sparse-checkout: |
-            ${{ steps.docs-folders-processed.outputs.sparse-checkout }}
-          ref: ${{ inputs.commit-sha || github.sha }}
-        if: steps.docs-folders-processed.outputs.docs-folders-processed != 
'all'
+            ${{ steps.docs-packages-processed.outputs.sparse-checkout }}
+          ref: ${{ steps.parameters.outputs.commit-ref }}
+        if: steps.docs-packages-processed.outputs.docs-packages-processed != 
'all'
       - name: >
-          Checkout (${{  inputs.commit-sha || github.sha }}) to 
/mnt/cloned-airflow-site-archive (whole repo)
+          Checkout (${{  steps.parameters.outputs.commit-ref }}) to 
/mnt/cloned-airflow-site-archive (whole repo)
         uses: actions/checkout@v4
         with:
           path: ./cloned-airflow-site-archive
           fetch-depth: 2
-          ref: ${{ inputs.commit-sha || github.sha }}
-        if: steps.docs-folders-processed.outputs.docs-folders-processed == 
'all'
+          ref: ${{ steps.parameters.outputs.commit-ref }}
+        if: steps.docs-packages-processed.outputs.docs-packages-processed == 
'all'
       - name: "Install uv"
         run: curl -LsSf https://astral.sh/uv/install.sh | sh
       - name: Add staging watermarks
@@ -177,24 +181,24 @@ jobs:
           chmod a+x add_watermark.py
           mkdir -p images
           curl -sSf -o images/staging.png 
https://raw.githubusercontent.com/apache/airflow-site/refs/heads/main/.github/scripts/images/staging.png
-          uv run add_watermark.py --pattern 'main.min.*.css' --folder 
docs-archive \
+          uv run add_watermark.py --pattern 'main.min.*.css' --package 
docs-archive \
           --image-directory images --url-prefix /docs/images
         if: inputs.destination == 'staging'
       - name: >
-          Syncing ${{ inputs.commit-sha || github.sha }}: ${{ 
inputs.destination }}:
-          ${{ inputs.sync-type }} ${{ 
steps.docs-folders-processed.outputs.docs-folders-processed }}
-          wih parallel aws cli methods = ${{ inputs.processes }}
+          Syncing ${{ steps.parameters.outputs.commit-ref }}: ${{ 
inputs.destination }}:
+          ${{ steps.parameters.outputs.sync-type }} ${{ 
steps.docs-packages-processed.outputs.docs-packages-processed }}
+          with parallel aws cli = ${{ inputs.processes }}
         env:
-          COMMIT_SHA: ${{ inputs.commit-sha || github.sha }}
-          SYNC_TYPE: ${{ inputs.sync-type }}
-          PROCESSES: ${{ inputs.processes }}
-          DOCUMENTS_FOLDERS: ${{ 
steps.docs-folders-processed.outputs.docs-folders-processed }}
+          COMMIT_REF: ${{ steps.parameters.outputs.commit-ref }}
+          SYNC_TYPE: ${{ steps.parameters.outputs.sync-type }}
+          DOCUMENTS_PACKAGES: ${{ 
steps.docs-packages-processed.outputs.docs-packages-processed }}
           DESTINATION_LOCATION: ${{ 
steps.parameters.outputs.destination-location }}
+          PROCESSES: ${{ inputs.processes }}
         run: |
           # show what's being run
           set -x
           if [[ "${SYNC_TYPE}" == "single_commit" ]]; then
-            echo "Syncing ${COMMIT_SHA}"
+            echo "Syncing ${COMMIT_REF}"
           else
             echo "Syncing whole repo"
           fi
@@ -205,6 +209,6 @@ jobs:
           uv run ./scripts/github_to_s3.py \
             --bucket-path ${DESTINATION_LOCATION} \
             --local-path /mnt/cloned-airflow-site-archive/docs-archive \
-            --document-folders "${DOCUMENTS_FOLDERS}" \
-            --commit-sha ${COMMIT_SHA} --sync-type ${SYNC_TYPE} \
+            --document-packages "${DOCUMENTS_PACKAGES}" \
+            --commit-ref ${COMMIT_REF} --sync-type ${SYNC_TYPE} \
             --processes ${PROCESSES}
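
For reference, the final step above boils down to roughly the following local invocation for the
default single-commit sync (a sketch; the staging bucket path and the clone location are the ones
used in the workflow, adjust them to your checkout):

```bash
# Rough local equivalent of the workflow's sync step (single-commit mode)
uv run ./scripts/github_to_s3.py \
  --bucket-path s3://staging-docs-airflow-apache-org/docs/ \
  --local-path /mnt/cloned-airflow-site-archive/docs-archive \
  --document-packages "all" \
  --commit-ref HEAD --sync-type single-commit \
  --processes 8
```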
diff --git a/.github/workflows/s3-to-github.yml 
b/.github/workflows/s3-to-github.yml
index d4650ff7cf..db3c8f6017 100644
--- a/.github/workflows/s3-to-github.yml
+++ b/.github/workflows/s3-to-github.yml
@@ -28,8 +28,8 @@ on: # yamllint disable-line rule:truthy
           - live
           - staging
         default: live
-      document-folders:
-        description: "Document folders to sync or short package ids (separated 
with spaces)"
+      document-packages:
+        description: "Document packages - long or short seprated by spaces"
         required: false
         default: "all"
         type: string
@@ -49,7 +49,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       SOURCE: ${{ inputs.source }}
-      DOCUMENT_FOLDERS: ${{ inputs.document-folders }}
+      DOCUMENT_PACKAGES: ${{ inputs.document-packages }}
       COMMIT_CHANGES: ${{ inputs.commit-changes }}
       PROCESSES: ${{ inputs.processes }}
     steps:
@@ -59,7 +59,7 @@ jobs:
           echo "Input parameters summary"
           echo "========================="
           echo "Source: ${SOURCE}"
-          echo "Document folders: ${DOCUMENT_FOLDERS}"
+          echo "Document packages: ${DOCUMENT_PACKAGES}"
           echo "Commit changes: ${COMMIT_CHANGES}"
           echo "Processes: ${PROCESSES}"
           if [[ "${SOURCE}"  == "live" ]]; then
@@ -86,37 +86,37 @@ jobs:
           aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }}
           aws-region: us-east-2
 
-      - name: Pre-process docs folders
+      - name: Pre-process docs packages
         env:
-          DOCUMENTS_FOLDERS: ${{ inputs.document-folders }}
-        id: docs-folders-processed
+          DOCUMENT_PACKAGES: ${{ inputs.document-packages }}
+        id: docs-packages-processed
         run: |
           echo "sparse-checkout<<EOF" >> ${GITHUB_OUTPUT}
           echo ".github" >> ${GITHUB_OUTPUT}
           echo "scripts" >> ${GITHUB_OUTPUT}
-          if [[ "${DOCUMENTS_FOLDERS}" != "all" ]]; then
-            echo "Preprocessing docs folders: ${DOCUMENTS_FOLDERS}"
-            folders=""
+          if [[ "${DOCUMENT_PACKAGES}" != "all" ]]; then
+            echo "Preprocessing docs packages: ${DOCUMENT_PACKAGES}"
+            packages=""
             sparse_checkout=""
             separator=""
-            for folder in ${DOCUMENTS_FOLDERS}; do
-              if [[ "${folder}" != apache-airflow-providers* && "${folder}" != 
"apache-airflow" \
-                 && "${folder}" != "docker-stack" && "${folder}" != 
"helm-chart" ]]; then
-                
folders="${folders}${separator}apache-airflow-providers-${folder/./-}"
-                echo "docs-archive/apache-airflow-providers-${folder/./-}" >> 
${GITHUB_OUTPUT}
+            for package in ${DOCUMENT_PACKAGES}; do
+              if [[ "${package}" != apache-airflow-providers* && "${package}" 
!= "apache-airflow" \
+                 && "${package}" != "docker-stack" && "${package}" != 
"helm-chart" ]]; then
+                
packages="${packages}${separator}apache-airflow-providers-${package/./-}"
+                echo "docs-archive/apache-airflow-providers-${package/./-}" >> 
${GITHUB_OUTPUT}
               else
-                folders="${folders}${separator}${folder}"
-                echo "docs-archive/${folder}" >> ${GITHUB_OUTPUT}
+                packages="${packages}${separator}${package}"
+                echo "docs-archive/${package}" >> ${GITHUB_OUTPUT}
               fi
               separator=" "
             done
           else
-            folders="all"
+            packages="all"
             echo "docs-archive" >> ${GITHUB_OUTPUT}
           fi
           echo "EOF" >> ${GITHUB_OUTPUT}
-          echo "docs-folders-processed=${folders}"
-          echo "docs-folders-processed=${folders}" >> ${GITHUB_OUTPUT}
+          echo "docs-packages-processed=${packages}"
+          echo "docs-packages-processed=${packages}" >> ${GITHUB_OUTPUT}
 
       - name: Create /mnt/cloned-airflow-site-archive directory
         run: |
@@ -125,14 +125,14 @@ jobs:
 
       - name: >
           Checkout (${{  inputs.commit-sha || github.sha }}) to 
/mnt/cloned-airflow-site-archive
-          with docs: ${{ 
steps.docs-folders-processed.outputs.docs-folders-processed }}
+          with docs: ${{ 
steps.docs-packages-processed.outputs.docs-packages-processed }}
         uses: actions/checkout@v4
         with:
           path: ./cloned-airflow-site-archive
           fetch-depth: 1
           sparse-checkout: |
-            ${{ steps.docs-folders-processed.outputs.sparse-checkout }}
-        if: steps.docs-folders-processed.outputs.docs-folders-processed != 
'all'
+            ${{ steps.docs-packages-processed.outputs.sparse-checkout }}
+        if: steps.docs-packages-processed.outputs.docs-packages-processed != 
'all'
 
       - name: >
           Checkout (${{  inputs.commit-sha || github.sha }}) to 
/mnt/cloned-airflow-site-archive (whole repo)
@@ -140,15 +140,15 @@ jobs:
         with:
           path: ./cloned-airflow-site-archive
           fetch-depth: 1
-        if: steps.docs-folders-processed.outputs.docs-folders-processed == 
'all'
+        if: steps.docs-packages-processed.outputs.docs-packages-processed == 
'all'
 
       - name: "Check space available"
         run: df -h
 
-      - name: Syncing ${{ inputs.source }} (${{ inputs.document-folders }})
+      - name: Syncing ${{ inputs.source }} (${{ inputs.document-packages }})
         env:
           PROCESSES: ${{ inputs.processes }}
-          DOCUMENTS_FOLDERS: ${{ inputs.document-folders }}
+          DOCUMENT_PACKAGES: ${{ inputs.document-packages }}
           SOURCE_LOCATION: ${{ steps.parameters.outputs.source-location }}
         run: |
           set -x
@@ -156,7 +156,7 @@ jobs:
           uv run ./scripts/s3_to_github.py \
             --bucket-path ${SOURCE_LOCATION} \
             --local-path ./docs-archive \
-            --document-folders "${DOCUMENT_FOLDERS}" \
+            --document-packages "${DOCUMENT_PACKAGES}" \
             --processes "${PROCESSES}"
         working-directory: /mnt/cloned-airflow-site-archive
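
The S3 -> GitHub step can likewise be reproduced locally; for example, pulling only a couple of
providers from the live bucket (the short ids `amazon` and `google` are illustrative and are
expanded to full `apache-airflow-providers-*` names by the script):

```bash
# Rough local equivalent of the S3 -> GitHub sync step for selected packages
uv run ./scripts/s3_to_github.py \
  --bucket-path s3://live-docs-airflow-apache-org/docs/ \
  --local-path ./docs-archive \
  --document-packages "amazon google" \
  --processes 8
```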
 
diff --git a/README.md b/README.md
index 25526b0a26..3f134b8bd3 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,13 @@
-# airflow-site-archive
+# Airflow site archive
 
-### Documentation Syncing Process
-### S3 To GitHub
-**Sync S3 to Github**: Use the `scripts/s3_to_github.py` script to download 
the latest documentation from S3 to your ./docs-archive folder.
-It has the following command line arguments:
-- `--bucket-path`: The S3 bucket path where the documentation is stored.
-- `--local-path`: The local path where the documentation will be downloaded.
-- `--document-folder`: The folder in the S3 bucket where the documentation is 
stored (This is optional if any particular
-                      folder need to be synced, provide the folder name ex: 
`apache-airflow-providers-amazon`).
-```bash
-uv run ./scripts/s3_to_github.py --bucket-path 
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive
-```
+This repository stores the archive of generated documentation for Apache Airflow.
 
+The scripts and workflows here make it possible to keep the repository in sync with the S3 buckets - both live and
+staging - where the documentation is stored. Syncing in both directions is possible.
 
-### GitHub To S3
-**Sync Github to S3**: Use the `scripts/github_to_s3.py` script to upload the 
latest documentation from your ./docs-archive folder to S3.
-It has two modes:
-1. **Last commit**: Syncs only last commit changes to S3.
-2. **Full sync**: Syncs all files under `./docs-archive` to S3.
-It has the following command line arguments:
-
-- `--bucket-path`: The S3 bucket path where the documentation will be stored.
-- `--local-path`: The local path where the documentation is stored.
-- `--document-folder`: The folder in the local path where the documentation is 
stored (This is optional if any particular
-                      folder need to be synced, provide the folder name ex: 
`apache-airflow-providers-amazon`).
-- `--sync-type`: The type of sync to perform. Can be either `last_commit` or 
`full_sync`.
-- `--commit-sha`: The commit sha to sync to S3. This is only required if the 
sync type is `last_commit`.
-
-```bash
-uv run ./scripts/github_to_s3.py --bucket-path 
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive 
--sync-type last-commit
-```
+In the future we will automate synchronization of the repository after any change to the buckets. Currently,
+manual S3 -> repository synchronization for the `live` bucket documentation is done using the `S3 to GitHub`
+workflow, which in turn runs the `s3_to_github.py` script, and syncing the repository to the `staging` bucket
+is done using the `GitHub to S3` workflow, which runs the `github_to_s3.py` script. The scripts can also be
+used to perform manual syncs when we modify the documentation in the repository and want to push it to
+either of the S3 buckets.
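
Updated to the new flag names, typical manual usage of the two scripts looks roughly like this
(a sketch; the bucket paths are the ones used by the workflows):

```bash
# Pull the docs archive from the live bucket into the repository
uv run ./scripts/s3_to_github.py \
  --bucket-path s3://live-docs-airflow-apache-org/docs/ --local-path ./docs-archive

# Push the whole archive from the repository to the staging bucket
uv run ./scripts/github_to_s3.py \
  --bucket-path s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive \
  --document-packages "all" --sync-type full-sync
```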
diff --git a/scripts/github_to_s3.py b/scripts/github_to_s3.py
index 4b78226d54..e9c51f0efa 100644
--- a/scripts/github_to_s3.py
+++ b/scripts/github_to_s3.py
@@ -31,7 +31,7 @@ from pathlib import Path
 
 from rich.console import Console
 
-from transfer_utils import CommonTransferUtils, 
convert_short_name_to_folder_name, sort_priority_folders
+from transfer_utils import CommonTransferUtils, 
convert_short_name_to_full_package_name, sort_priority_packages
 
 console = Console(width=200, color_system="standard")
 
@@ -40,16 +40,16 @@ class GithubToS3(CommonTransferUtils):
         super().__init__(bucket, local_path)
 
     @staticmethod
-    def fetch_commit_files(commit_sha: str, diff_filter: str="ACM"):
-        console.print(f"[blue] Fetching files from last commit {commit_sha} 
[/]")
+    def fetch_commit_files(commit_ref: str, diff_filter: str="ACM"):
+        console.print(f"[blue] Fetching files from last commit {commit_ref} 
[/]")
         cmd = [
             "git",
             "diff-tree",
             "--no-commit-id",
             "--name-only",
             "-r",
-            commit_sha + "^",
-            commit_sha,
+            commit_ref + "^",
+            commit_ref,
             f"--diff-filter={diff_filter}"
         ]
         result = subprocess.run(cmd, check=False, capture_output=True, 
text=True)
@@ -61,17 +61,17 @@ class GithubToS3(CommonTransferUtils):
             sys.exit(1)
         return result.stdout.splitlines() if result.stdout else []
 
-    def sync_single_commit_files(self, commit_sha: str, processes: int):
+    def sync_single_commit_files(self, commit_ref: str, processes: int):
         '''
         There are two parts here.
-        1. When any file gets removed under docs folder, we will remove from 
target location
-        2. When any file gets added/changed/modified under docs folder, we 
will copy from source to target location
+        1. When any file gets removed under a docs package, we remove it from the target location
+        2. When any file gets added/changed/modified under a docs package, we copy it from the source to the target location
         '''
         # Fetching `d` excludes deleted files
         # Fetching `D` includes deleted files
 
-        files_cp_required = self.fetch_commit_files(commit_sha, 
diff_filter="d")
-        files_del_required = self.fetch_commit_files(commit_sha, 
diff_filter="D")
+        files_cp_required = self.fetch_commit_files(commit_ref, 
diff_filter="d")
+        files_del_required = self.fetch_commit_files(commit_ref, 
diff_filter="D")
 
         files_cp_required_under_docs = [f for f in files_cp_required if 
f.startswith("docs-archive/")]
         files_del_required_required_under_docs = [f for f in 
files_del_required if f.startswith("docs-archive/")]
@@ -91,16 +91,16 @@ class GithubToS3(CommonTransferUtils):
         self.run_with_pool(self.remove, delete_files_pool_args, 
processes=processes)
         self.run_with_pool(self.copy, copy_files_pool_args, 
processes=processes)
 
-    def full_sync(self, processes: int, folders: list[str] | None = None):
-        if folders:
-            console.print(f"[blue] Syncing folders {folders} from 
{self.local_path} to {self.bucket_name} [/]")
+    def full_sync(self, processes: int, packages: list[str] | None = None):
+        if packages:
+            console.print(f"[blue] Syncing packages {packages} from 
{self.local_path} to {self.bucket_name} [/]")
         else:
             console.print(f"[blue] Syncing all files from {self.local_path} to 
{self.bucket_name} [/]")
-        list_of_folders = os.listdir(self.local_path) if not folders else 
folders
+        list_of_packages = os.listdir(self.local_path) if not packages else 
packages
         pool_args = []
-        for folder in sort_priority_folders(list_of_folders):
-            source = os.path.join(self.local_path, folder)
-            dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/" 
+ folder
+        for package in sort_priority_packages(list_of_packages):
+            source = os.path.join(self.local_path, package)
+            dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/" 
+ package
             pool_args.append((source, dest))
 
         self.run_with_pool(self.sync, pool_args, processes=processes)
@@ -111,11 +111,12 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Sync GitHub to S3")
     parser.add_argument("--bucket-path", required=True, help="S3 bucket name 
with path")
     parser.add_argument("--local-path", required=True, help="local path to 
sync")
-    parser.add_argument("--document-folders", help="Document folders to sync "
+    parser.add_argument("--document-packages", help="Document packages to sync 
"
                                                    "(or short provider-ids) 
separated with spaces "
-                                                   "('all' means all 
folders)", default="")
-    parser.add_argument("--commit-sha", help="Commit SHA to sync", default="")
-    parser.add_argument("--sync-type", help="Sync type", 
default="single_commit")
+                                                   "('all' means all 
packages)", default="")
+    parser.add_argument("--commit-ref", help="Commit ref to sync 
(sha/HEAD/branch)", default="")
+    parser.add_argument("--sync-type", help="Sync type", choices=('full-sync', 
'single-commit'),
+                        default="single-commit")
     parser.add_argument("--processes", help="Number of processes", type=int, 
default=8)
 
     args = parser.parse_args()
@@ -123,36 +124,36 @@ if __name__ == "__main__":
     syncer = GithubToS3(bucket=args.bucket_path, local_path=args.local_path)
     syncer.check_bucket()
 
-    document_folders = args.document_folders
+    document_packages = args.document_packages
     # Make sure you are in the right directory for git commands
     os.chdir(Path(args.local_path).parent.as_posix())
     # Force color
     os.environ["FORCE_COLOR"] = "1"
 
-    if document_folders != "all" and args.sync_type == "single_commit":
-        console.print(f"[red] Invalid folder name {document_folders} for sync 
type {args.sync_type} - only "
-                      f"all can be used with single_commit[/]")
+    if document_packages != "all" and args.sync_type == "single-commit":
+        console.print(f"[red] Invalid package name {document_packages} for 
sync type {args.sync_type} - only "
+                      f"all can be used with single-commit[/]")
         sys.exit(1)
 
-    if document_folders and document_folders != "all" and args.sync_type == 
"full_sync":
-        folders_to_sync = []
-        for _folder in document_folders.split(" "):
-            full_local_path = Path(f"{args.local_path}/{_folder}")
+    if document_packages and document_packages != "all" and args.sync_type == 
"full-sync":
+        packages_to_sync = []
+        for _package in document_packages.split(" "):
+            full_local_path = Path(f"{args.local_path}/{_package}")
             if not full_local_path.exists():
-                full_local_path = 
Path(f"{args.local_path}/{convert_short_name_to_folder_name(_folder)}")
+                full_local_path = 
Path(f"{args.local_path}/{convert_short_name_to_full_package_name(_package)}")
             if full_local_path.exists():
-                console.print(f"[blue] Document folder {_folder} exists in 
bucket {args.bucket_path}.[/]")
-                folders_to_sync.append(_folder)
+                console.print(f"[blue] Document package {_package} exists in 
bucket {args.bucket_path}.[/]")
+                packages_to_sync.append(_package)
             else:
-                console.print(f"[red] Document folder {full_local_path} does 
not exist.[/]")
+                console.print(f"[red] Document package {full_local_path} does 
not exist.[/]")
                 sys.exit(1)
-        syncer.full_sync(processes=int(args.processes), 
folders=folders_to_sync)
-    elif args.sync_type == "full_sync":
+        syncer.full_sync(processes=int(args.processes), 
packages=packages_to_sync)
+    elif args.sync_type == "full-sync":
         syncer.full_sync(processes=int(args.processes))
-    elif args.sync_type == "single_commit" and args.commit_sha and 
document_folders == "all":
-        console.print(f"[blue] Syncing last commit {args.commit_sha} from 
{args.local_path} [/]")
-        syncer.sync_single_commit_files(args.commit_sha, 
processes=int(args.processes))
+    elif args.sync_type == "single-commit" and args.commit_ref and 
document_packages == "all":
+        console.print(f"[blue] Syncing last commit {args.commit_ref} from 
{args.local_path} [/]")
+        syncer.sync_single_commit_files(args.commit_ref, 
processes=int(args.processes))
     else:
-        console.print(f"[red] Invalid sync type {args.sync_type} with document 
folders {document_folders} "
-                      f"and commit sha {args.commit_sha}[/]")
+        console.print(f"[red] Invalid sync type {args.sync_type} with document 
packages {document_packages} "
+                      f"and commit ref {args.commit_ref}[/]")
         sys.exit(1)
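
For the single-commit mode, the set of files to copy and to delete comes straight from the commit
itself; `fetch_commit_files` runs roughly the following git commands (sketch, with `COMMIT_REF`
standing for the `--commit-ref` value):

```bash
# Files added/changed/modified in the commit (to be copied to S3); lowercase "d" excludes deletions
git diff-tree --no-commit-id --name-only -r "${COMMIT_REF}^" "${COMMIT_REF}" --diff-filter=d

# Files deleted in the commit (to be removed from S3)
git diff-tree --no-commit-id --name-only -r "${COMMIT_REF}^" "${COMMIT_REF}" --diff-filter=D
```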
diff --git a/scripts/s3_to_github.py b/scripts/s3_to_github.py
index 0eac7ffb10..f780ffd7b4 100644
--- a/scripts/s3_to_github.py
+++ b/scripts/s3_to_github.py
@@ -29,7 +29,7 @@ import sys
 
 from rich.console import Console
 
-from transfer_utils import CommonTransferUtils, 
convert_short_name_to_folder_name, sort_priority_folders, \
+from transfer_utils import CommonTransferUtils, 
convert_short_name_to_full_package_name, sort_priority_packages, \
     sort_priority_tuples
 
 console = Console(width=200, color_system="standard")
@@ -40,18 +40,18 @@ class S3TOGithub(CommonTransferUtils):
     def __init__(self, bucket:str , local_path: str):
         super().__init__(bucket, local_path)
 
-    def verify_document_folder(self, document_folder: str):
+    def verify_document_package(self, document_package: str):
         response= self.s3_client.list_objects_v2(
             Bucket=self.bucket_name,
-            Prefix=self.prefix.rstrip("/") + "/" + document_folder,
+            Prefix=self.prefix.rstrip("/") + "/" + document_package,
         )
         return response["KeyCount"] > 0
 
-    def sync_s3_to_github(self, processes: int, folders: list[str] | None = 
None,
+    def sync_s3_to_github(self, processes: int, packages: list[str] | None = 
None,
                           remote_prefix: str = "docs/"):
         console.print("[blue] Syncing files from S3 to GitHub...[/]")
-        prefixes = self.get_list_of_folders() if not folders else [
-            f"{remote_prefix}{folder}" for folder in folders
+        prefixes = self.get_list_of_packages() if not packages else [
+            f"{remote_prefix}{package}" for package in packages
         ]
         pool_args = []
         for pref in prefixes:
@@ -70,31 +70,31 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Sync S3 to GitHub")
     parser.add_argument("--bucket-path", required=True, help="S3 bucket name 
with path")
     parser.add_argument("--local-path", required=True, help="local path to 
sync")
-    parser.add_argument("--document-folders", help="Document folders to sync", 
default="all")
+    parser.add_argument("--document-packages", help="Document packages to 
sync", default="all")
     parser.add_argument("--processes", help="Number of processes", type=int, 
default=8)
 
     args = parser.parse_args()
 
     syncer = S3TOGithub(bucket=args.bucket_path, local_path=args.local_path)
     syncer.check_bucket()
-    _document_folders = args.document_folders
+    _document_packages = args.document_packages
     # Make sure you are in the right directory for git commands
     os.chdir(Path(args.local_path).parent.as_posix())
     # Force color
     os.environ["FORCE_COLOR"] = "1"
 
-    if _document_folders and _document_folders != "all":
-        folders_to_sync = []
-        for _folder in _document_folders.split(" "):
-            full_folder_name = convert_short_name_to_folder_name(_folder)
-            if syncer.verify_document_folder(full_folder_name):
-                console.print(f"[blue] Document folder {full_folder_name} 
exists in bucket {args.bucket_path}.[/]")
-                folders_to_sync.append(full_folder_name)
+    if _document_packages and _document_packages != "all":
+        packages_to_sync = []
+        for _package in _document_packages.split(" "):
+            full_package_name = 
convert_short_name_to_full_package_name(_package)
+            if syncer.verify_document_package(full_package_name):
+                console.print(f"[blue] Document package {full_package_name} 
exists in bucket {args.bucket_path}.[/]")
+                packages_to_sync.append(full_package_name)
             else:
-                console.print(f"[red] Document folder {full_folder_name} does 
not exist in bucket {args.bucket_path}.[/]")
+                console.print(f"[red] Document package {full_package_name} 
does not exist in bucket {args.bucket_path}.[/]")
                 sys.exit(1)
 
-        folders_to_sync = sort_priority_folders(folders_to_sync)
-        syncer.sync_s3_to_github(processes=int(args.processes), 
folders=folders_to_sync)
+        packages_to_sync = sort_priority_packages(packages_to_sync)
+        syncer.sync_s3_to_github(processes=int(args.processes), 
packages=packages_to_sync)
     else:
         syncer.sync_s3_to_github(processes=int(args.processes))
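
Before anything is synced, each requested package is verified against the bucket by listing its
prefix; conceptually the check is close to the following (example bucket and provider shown):

```bash
# A package "exists" in the bucket if listing its prefix returns at least one key
aws s3 ls s3://live-docs-airflow-apache-org/docs/apache-airflow-providers-amazon | head -1
```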
diff --git a/scripts/transfer_utils.py b/scripts/transfer_utils.py
index 4692c8cdc8..6dd480bcdf 100644
--- a/scripts/transfer_utils.py
+++ b/scripts/transfer_utils.py
@@ -16,13 +16,13 @@ KNOWN_PACKAGES = ["apache-airflow", "helm-chart", 
"docker-stack"]
 
 console = Console(width=200, color_system="standard")
 
-def track_progress(folder: str, file_path: Path):
+def track_progress(package: str, file_path: Path):
     while file_path.exists():
         sleep(10)
         if not file_path.exists():
             break
         num_lines = file_path.read_text().count("\n")
-        console.print(f"{folder}:[blue] Processed {num_lines} files[/]")
+        console.print(f"{package}:[blue] Processed {num_lines} files[/]")
 
 
 class CommonTransferUtils:
@@ -61,8 +61,8 @@ class CommonTransferUtils:
             console.print(f"[red] Error: {e}[/]")
             sys.exit(1)
 
-    def get_list_of_folders(self) -> list[str]:
-        folders = []
+    def get_list_of_packages(self) -> list[str]:
+        packages = []
         try:
             response = self.s3_client.list_objects_v2(
                 Bucket=self.bucket_name,
@@ -71,8 +71,8 @@ class CommonTransferUtils:
             )
             if 'CommonPrefixes' in response:
                 for cur_prefix in response['CommonPrefixes']:
-                    folders.append(cur_prefix['Prefix'])
-            return sorted(folders)
+                    packages.append(cur_prefix['Prefix'])
+            return sorted(packages)
         except Exception as e:
             console.print(f"[yellow] Error: {e}[/]")
             return []
@@ -104,7 +104,7 @@ class CommonTransferUtils:
     @staticmethod
     def run_with_pool(func: Callable, args: Any, processes: int = 4):
         # Chunksize is set to 1 - otherwise map / starmap will send tasks in 
chunks
-        # and the prioritization we set for folders will be lost.
+        # and the prioritization we set for packages will be lost.
         # Our tasks are big enough to not cause overhead of sending
         # them one at a time.
         with Pool(processes=processes) as pool:
@@ -136,33 +136,33 @@ class CommonTransferUtils:
         )
         console.print(f"{file_to_delete}[green] Delete completed[/]")
 
-def convert_short_name_to_folder_name(short_name: str) -> str:
+def convert_short_name_to_full_package_name(short_name: str) -> str:
     if not short_name.startswith("apache-airflow-providers-") and short_name 
not in KNOWN_PACKAGES:
         return f"apache-airflow-providers-{short_name.replace('.', '-')}"
     return short_name
 
-# start with those folders first
-PRIORITY_FOLDERS = ["apache-airflow-providers-google", 
"apache-airflow-providers-amazon", "apache-airflow"]
+# start with those packages first
+PRIORITY_PACKAGES = ["apache-airflow-providers-google", 
"apache-airflow-providers-amazon", "apache-airflow"]
 
-def sort_priority_folders(folders: list[str]) -> list[str]:
+def sort_priority_packages(packages: list[str]) -> list[str]:
     """
-    Sort the folders in a way that the priority folders are at the top
+    Sort the packages in a way that the priority packages are at the top
     """
-    sorted_folders = []
-    for folder in PRIORITY_FOLDERS:
-        if folder in folders:
-            sorted_folders.append(folder)
-            folders.remove(folder)
-    return sorted_folders + sorted(folders)
+    sorted_packages = []
+    for package in PRIORITY_PACKAGES:
+        if package in packages:
+            sorted_packages.append(package)
+            packages.remove(package)
+    return sorted_packages + sorted(packages)
 
 def sort_priority_tuples(tuples: list[tuple[str, str]]) -> list[tuple[str, 
str]]:
     """
-    Sort the tuples in a way that the priority folders are at the top
+    Sort the tuples in a way that the priority packages are at the top
     """
     sorted_tuples = []
-    for folder in PRIORITY_FOLDERS:
+    for package in PRIORITY_PACKAGES:
         for tup in tuples:
-            if tup[0].endswith(folder +"/"):
+            if tup[0].endswith(package +"/"):
                 sorted_tuples.append(tup)
                 tuples.remove(tup)
     return sorted_tuples + sorted(tuples, key=lambda x: x[0])
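
The short-name convention implemented by `convert_short_name_to_full_package_name` can be
summarised with a small shell sketch (the function below is illustrative only, not part of the
scripts):

```bash
# Illustrative equivalent of convert_short_name_to_full_package_name
to_full_package_name() {
  case "$1" in
    apache-airflow-providers-*|apache-airflow|helm-chart|docker-stack)
      echo "$1" ;;                                 # already a full or known package name
    *)
      echo "apache-airflow-providers-${1//./-}" ;; # short provider id, dots become dashes
  esac
}

to_full_package_name "cncf.kubernetes"  # -> apache-airflow-providers-cncf-kubernetes
to_full_package_name "helm-chart"       # -> helm-chart (known package, kept as-is)
```

The priority list then simply moves the google and amazon providers and apache-airflow itself to
the front of the work queue so they start syncing first.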

