Re: [PR] feat(bigquery): add GEOGRAPHY type support for BigQuery I/O [beam]

via GitHub Sat, 13 Sep 2025 21:12:24 -0700


gemini-code-assist[bot] commented on code in PR #36121:
URL: https://github.com/apache/beam/pull/36121#discussion_r2347001826



##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1028,6 +1028,16 @@ def __init__(
     self._step_name = step_name
     self._source_uuid = unique_id
 
+  def _get_project(self):
+    """Returns the project that queries and exports will be billed to."""
+    if self.pipeline_options:
+      project = self.pipeline_options.view_as(GoogleCloudOptions).project
+      if isinstance(project, vp.ValueProvider):
+        project = project.get()
+      if project:
+        return project
+    return self.project

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   This new `_get_project` method duplicates logic from the existing 
`_get_parent_project` method. The `_get_parent_project` method is also more 
robust as it handles `self.temp_table`. To improve maintainability and reduce 
code duplication, could you remove this new method and use 
`_get_parent_project` at the call site in the `split` method instead?
   
   For example, you could change lines 1173-1174 to:
   ```python
         if not self.table_reference.projectId:
           self.table_reference.projectId = self._get_parent_project()
   ```



##########
sdks/python/apache_beam/io/gcp/bigquery_geography_it_test.py:
##########
@@ -0,0 +1,539 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Integration tests for BigQuery GEOGRAPHY data type support."""
+
+import logging
+import secrets
+import time
+import unittest
+
+import hamcrest as hc
+import pytest
+
+import apache_beam as beam
+from apache_beam.io.gcp.bigquery import ReadFromBigQuery
+from apache_beam.io.gcp.bigquery import WriteToBigQuery
+from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
+from apache_beam.io.gcp.internal.clients import bigquery
+from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
+
+try:
+  from apitools.base.py.exceptions import HttpError
+except ImportError:
+  HttpError = None
+
+_LOGGER = logging.getLogger(__name__)
+
+
[email protected](HttpError is None, 'GCP dependencies are not installed')
+class BigQueryGeographyIntegrationTests(unittest.TestCase):
+  """Integration tests for BigQuery GEOGRAPHY data type."""
+
+  BIG_QUERY_DATASET_ID = 'python_geography_it_test_'
+
+  def setUp(self):
+    self.test_pipeline = TestPipeline(is_integration_test=True)
+    self.runner_name = type(self.test_pipeline.runner).__name__
+    self.project = self.test_pipeline.get_option('project')
+
+    self.bigquery_client = BigQueryWrapper()
+    self.dataset_id = '%s%d%s' % (
+        self.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3))
+    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
+    _LOGGER.info(
+        "Created dataset %s in project %s", self.dataset_id, self.project)
+
+  def tearDown(self):
+    request = bigquery.BigqueryDatasetsDeleteRequest(
+        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
+    try:
+      _LOGGER.info(
+          "Deleting dataset %s in project %s", self.dataset_id, self.project)
+      self.bigquery_client.client.datasets.Delete(request)
+    except HttpError:
+      _LOGGER.debug(
+          'Failed to clean up dataset %s in project %s',
+          self.dataset_id,
+          self.project)
+
+  def create_geography_table(self, table_name, include_repeated=False):
+    """Create a table with various GEOGRAPHY field configurations."""
+    table_schema = bigquery.TableSchema()
+
+    # ID field
+    id_field = bigquery.TableFieldSchema()
+    id_field.name = 'id'
+    id_field.type = 'INTEGER'
+    id_field.mode = 'REQUIRED'
+    table_schema.fields.append(id_field)
+
+    # Required GEOGRAPHY field
+    geo_required = bigquery.TableFieldSchema()
+    geo_required.name = 'location'
+    geo_required.type = 'GEOGRAPHY'
+    geo_required.mode = 'REQUIRED'
+    table_schema.fields.append(geo_required)
+
+    # Nullable GEOGRAPHY field
+    geo_nullable = bigquery.TableFieldSchema()
+    geo_nullable.name = 'optional_location'
+    geo_nullable.type = 'GEOGRAPHY'
+    geo_nullable.mode = 'NULLABLE'
+    table_schema.fields.append(geo_nullable)
+
+    if include_repeated:
+      # Repeated GEOGRAPHY field
+      geo_repeated = bigquery.TableFieldSchema()
+      geo_repeated.name = 'path'
+      geo_repeated.type = 'GEOGRAPHY'
+      geo_repeated.mode = 'REPEATED'
+      table_schema.fields.append(geo_repeated)
+
+    table = bigquery.Table(
+        tableReference=bigquery.TableReference(
+            projectId=self.project,
+            datasetId=self.dataset_id,
+            tableId=table_name),
+        schema=table_schema)
+    request = bigquery.BigqueryTablesInsertRequest(
+        projectId=self.project, datasetId=self.dataset_id, table=table)
+    self.bigquery_client.client.tables.Insert(request)
+
+    # Wait for table to be available
+    _ = self.bigquery_client.get_table(
+        self.project, self.dataset_id, table_name)
+
+  @pytest.mark.it_postcommit
+  def test_geography_write_and_read_basic_geometries(self):
+    """Test writing and reading basic GEOGRAPHY geometries."""
+    table_name = 'geography_basic_geometries'
+    table_id = '{}.{}'.format(self.dataset_id, table_name)
+
+    # Test data with various WKT geometry types
+    input_data = [
+        {
+            'id': 1,
+            'location': 'POINT(30 10)',
+            'optional_location': ('POINT(-122.4194 37.7749)')  # San Francisco
+        },
+        {
+            'id': 2,
+            'location': 'LINESTRING(30 10, 10 30, 40 40)',
+            'optional_location': None
+        },
+        {
+            'id': 3,
+            'location': ('POLYGON((30 10, 40 40, 20 40, 10 20, 30 10))'),
+            'optional_location': ('POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
+        },
+        {
+            'id': 4,
+            'location': ('MULTIPOINT((10 40), (40 30), (20 20), (30 10))'),
+            'optional_location': 'POINT(0 0)'
+        },
+        {
+            'id': 5,
+            'location': (
+                'MULTILINESTRING((10 10, 20 20, 10 40), '
+                '(40 40, 30 30, 40 20, 30 10))'),
+            'optional_location': None
+        }
+    ]
+
+    table_schema = {
+        "fields": [{
+            "name": "id", "type": "INTEGER", "mode": "REQUIRED"
+        }, {
+            "name": "location", "type": "GEOGRAPHY", "mode": "REQUIRED"
+        },
+                   {
+                       "name": "optional_location",
+                       "type": "GEOGRAPHY",
+                       "mode": "NULLABLE"
+                   }]
+    }
+
+    # Write data to BigQuery
+    with TestPipeline(is_integration_test=True) as p:
+      _ = (
+          p
+          | 'CreateData' >> beam.Create(input_data)
+          | 'WriteToBQ' >> WriteToBigQuery(
+              table=table_id,
+              schema=table_schema,
+              method=WriteToBigQuery.Method.STREAMING_INSERTS,
+              project=self.project))
+
+    # Read data back and verify
+    with TestPipeline(is_integration_test=True) as p:
+      result = (
+          p
+          | 'ReadFromBQ' >> ReadFromBigQuery(
+              table=table_id,
+              project=self.project,
+              method=ReadFromBigQuery.Method.DIRECT_READ)
+          | 'ExtractGeography' >> beam.Map(
+              lambda row:
+              (row['id'], row['location'], row['optional_location'])))
+
+      expected_data = [
+          (1, 'POINT(30 10)', 'POINT(-122.4194 37.7749)'),
+          (2, 'LINESTRING(30 10, 10 30, 40 40)', None),
+          (
+              3,
+              'POLYGON((30 10, 40 40, 20 40, 10 20, 30 10))',
+              'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))'),
+          (4, 'MULTIPOINT(20 20, 10 40, 40 30, 30 10)', 'POINT(0 0)'),
+          (
+              5,
+              'MULTILINESTRING((10 10, 20 20, 10 40), '
+              '(40 40, 30 30, 40 20, 30 10))',
+              None)
+      ]
+
+      assert_that(result, equal_to(expected_data))
+
+  @pytest.mark.it_postcommit
+  def test_geography_write_with_beam_rows(self):
+    """Test writing GEOGRAPHY data using Beam Rows with GeographyType."""
+    table_name = 'geography_beam_rows'
+    table_id = '{}.{}'.format(self.dataset_id, table_name)
+
+    # Create the table first
+    self.create_geography_table(table_name)
+
+    # Create Beam Rows with GeographyType
+    row_elements = [
+        beam.Row(id=1, location='POINT(1 1)', optional_location='POINT(2 2)'),
+        beam.Row(
+            id=2, location='LINESTRING(0 0, 1 1, 2 2)', optional_location=None),
+        beam.Row(
+            id=3,
+            location='POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))',
+            optional_location='POINT(3 3)')
+    ]
+
+    # Expected data for verification
+    expected_data = [(1, 'POINT(1 1)', 'POINT(2 2)'),
+                     (2, 'LINESTRING(0 0, 1 1, 2 2)', None),
+                     (3, 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))', 'POINT(3 3)')]
+
+    pipeline_verifiers = [
+        BigqueryFullResultMatcher(
+            project=self.project,
+            query=(
+                "SELECT id, location, optional_location FROM %s ORDER BY id" %
+                table_id),
+            data=expected_data)
+    ]
+
+    args = self.test_pipeline.get_full_options_as_args()
+
+    with beam.Pipeline(argv=args) as p:
+      _ = (
+          p
+          | 'CreateRows' >> beam.Create(row_elements)
+          | 'ConvertToDict' >> beam.Map(
+              lambda row: {
+                  'id': row.id, 'location': row.location,
+                  'optional_location': row.optional_location
+              })

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   The `ConvertToDict` step appears to be unnecessary here. `WriteToBigQuery` 
can directly handle a `PCollection` of `beam.Row` objects if the PCollection 
has a schema, which `beam.Create` should infer for `row_elements`. Removing 
this conversion would simplify the pipeline.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(bigquery): add GEOGRAPHY type support for BigQuery I/O [beam]

Reply via email to