Re: [PR] Added BigQuery Metastore Catalog [iceberg-python]

via GitHub Sat, 28 Jun 2025 13:01:48 -0700


jayceslesar commented on code in PR #2068:
URL: https://github.com/apache/iceberg-python/pull/2068#discussion_r2173510100



##########
pyiceberg/catalog/bigquery_metastore.py:
##########
@@ -0,0 +1,419 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union
+
+from google.api_core.exceptions import NotFound
+from google.cloud.bigquery import Client, Dataset, DatasetReference, 
TableReference
+from google.cloud.bigquery import Table as BQTable
+from google.cloud.bigquery.external_config import 
ExternalCatalogDatasetOptions, ExternalCatalogTableOptions
+from google.cloud.bigquery.schema import SerDeInfo, StorageDescriptor
+from google.cloud.exceptions import Conflict
+from google.oauth2 import service_account
+
+from pyiceberg.catalog import WAREHOUSE_LOCATION, MetastoreCatalog, 
PropertiesUpdateSummary
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, 
NoSuchNamespaceError, NoSuchTableError, TableAlreadyExistsError
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import CommitTableResponse, Table
+from pyiceberg.table.locations import load_location_provider
+from pyiceberg.table.metadata import TableMetadata, new_table_metadata
+from pyiceberg.table.snapshots import TOTAL_DATA_FILES, TOTAL_FILE_SIZE, 
TOTAL_RECORDS
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.table.update import TableRequirement, TableUpdate
+from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
+from pyiceberg.utils.config import Config
+
+if TYPE_CHECKING:
+    import pyarrow as pa
+
+GCP_PROJECT_ID = "gcp.project-id"
+GCP_LOCATION = "gcp.location"
+GCP_CREDENTIALS_LOCATION = "gcp.credentials-location"
+GCP_CREDENTIALS_INFO = "gcp.credentials-info"
+
+METADATA_LOCATION_PROP = "metadata_location"
+PREVIOUS_METADATA_LOCATION_PROP = "previous_metadata_location"
+TABLE_TYPE_PROP = "table_type"
+ICEBERG_TABLE_TYPE_VALUE = "ICEBERG"
+
+HIVE_SERIALIZATION_LIBRARY = "org.apache.iceberg.mr.hive.HiveIcebergSerDe"
+HIVE_FILE_INPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergInputFormat"
+HIVE_FILE_OUTPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergOutputFormat"
+
+
+class BigQueryMetastoreCatalog(MetastoreCatalog):
+    def __init__(self, name: str, **properties: str):
+        super().__init__(name, **properties)
+
+        project_id: Optional[str] = self.properties.get(GCP_PROJECT_ID)
+        location: Optional[str] = self.properties.get(GCP_LOCATION)
+        credentials_location: Optional[str] = 
self.properties.get(GCP_CREDENTIALS_LOCATION)
+        credentials_info_str: Optional[str] = 
self.properties.get(GCP_CREDENTIALS_INFO)
+
+        if not project_id:
+            raise ValueError(f"Missing property: {GCP_PROJECT_ID}")
+
+        # BigQuery requires current-snapshot-id to be present for tables to be 
created.
+        if not Config().get_bool("legacy-current-snapshot-id"):
+            raise ValueError("legacy-current-snapshot-id must be enabled to 
work with BigQuery.")
+
+        gcp_credentials = None
+        if credentials_location:
+            gcp_credentials = 
service_account.Credentials.from_service_account_file(credentials_location)
+        elif credentials_info_str:
+            try:

Review Comment:
   `credentials_location` will take precedence if the user specifies both it 
and `credentials_info` -- do we want to allow a user to specify both? I don't 
really know.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Added BigQuery Metastore Catalog [iceberg-python]

Reply via email to