This is an automated email from the ASF dual-hosted git repository. ahmedabualsaud pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push: new ea8596f2df0 Add ability to run performance regression checks on Beam IO Load tests. (#29226) ea8596f2df0 is described below commit ea8596f2df0e3e4b9da9f215ae6745c2ddfb6612 Author: Pranav Bhandari <bhandari.prana...@gmail.com> AuthorDate: Wed Nov 1 12:50:01 2023 -0400 Add ability to run performance regression checks on Beam IO Load tests. (#29226) --- .../testing/analyzers/io_tests_config.yaml | 256 +++++++++++++++++++++ .../testing/analyzers/load_test_perf_analysis.py | 98 ++++++++ .../testing/analyzers/perf_analysis_utils.py | 11 +- 3 files changed, 364 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml new file mode 100644 index 00000000000..2a33ae31797 --- /dev/null +++ b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml @@ -0,0 +1,256 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +spanner_io_read: + test_description: | + SpannerIO Read test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_read_runnerV2: + test_description: | + SpannerIO RunnerV2 Read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write: + test_description: | + SpannerIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write_runnerV2: + test_description: | + SpannerIO RunnerV2 write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read: + test_description: | + BigQueryIO Storage write API read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Storage write API read test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write_runnerV2: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read: + test_description: | + BigQueryIO Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write: + test_description: | + BigQueryIO Avro file loads write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_read: + test_description: | + BigQueryIO Json file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_write: + test_description: | + BigQueryIO Json file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_read: + test_description: | + BigTableIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,read-bigtable + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_write: + test_description: | + BigTableIO write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,write-bigtable + metric_name: + - RunTime + - EstimatedCost + +text_io_read: + test_description: | + TextIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_read_runnerV2: + test_description: | + TextIO RunnerV2 read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read_textio_v2 + metric_name: + - RunTime + - EstimatedCost + +text_io_write: + test_description: | + TextIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,write-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_write_runnerV2: + test_description: | + TextIO RunnerV2 write test 100 GB. 
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Entry point to run change-point (performance regression) analysis on
Beam IO load-test metrics stored in a BigQuery table."""

import argparse
import logging

from apache_beam.testing.analyzers import constants
from apache_beam.testing.analyzers import perf_analysis
from apache_beam.testing.analyzers import perf_analysis_utils
from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer

try:
  from google.cloud import bigquery
except ImportError:
  bigquery = None  # type: ignore


class LoadTestMetricsFetcher(perf_analysis_utils.MetricsFetcher):
  """Metrics fetcher used to get metric data from a BigQuery table.

  The metrics are fetched and returned as a dataclass containing lists of
  timestamps and metric_values.
  """
  def fetch_metric_data(
      self, *, test_config: TestConfigContainer) -> MetricContainer:
    """Fetch the most recent metric values for one load test.

    Args:
      test_config: Config entry describing the BigQuery table to query.
        ``test_config.test_name`` must be in the format
        ``testName,pipelineName``.

    Returns:
      A MetricContainer holding the fetched metric values and their
      timestamps, newest first. Both lists are empty when the query
      returns no rows (an error is logged in that case).

    Raises:
      ValueError: If ``test_name`` is missing or not in the expected
        ``testName,pipelineName`` format.
      ImportError: If the BigQuery client library is not installed.
    """
    if not test_config.test_name:
      # ValueError instead of bare Exception: more specific, and still
      # caught by any existing `except Exception` handlers.
      raise ValueError("test_name not provided in config.")
    try:
      test_name, pipeline_name = test_config.test_name.split(',')
    except ValueError:
      # Re-raise the tuple-unpacking failure with an actionable message.
      raise ValueError(
          'test_name must be in the format "testName,pipelineName", '
          f'got: {test_config.test_name}')

    # Fail fast before building the query or touching the network.
    if bigquery is None:
      raise ImportError('Bigquery dependencies are not installed.')

    # NOTE(review): the query is assembled via f-string interpolation.
    # The values come from a checked-in YAML config, not end-user input,
    # so this is accepted here; do not reuse this pattern with untrusted
    # values.
    query = f"""
      SELECT timestamp, metric.value
      FROM {test_config.project}.{test_config.metrics_dataset}.{test_config.metrics_table}
      CROSS JOIN UNNEST(metrics) AS metric
      WHERE test_name = "{test_name}" AND pipeline_name = "{pipeline_name}" AND metric.name = "{test_config.metric_name}"
      ORDER BY timestamp DESC
      LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
    """
    # Lazy %-style args: the message is only rendered when DEBUG is enabled.
    logging.debug("Running query: %s", query)
    client = bigquery.Client()
    query_job = client.query(query=query)
    metric_data = query_job.result().to_dataframe()
    if metric_data.empty:
      # Best-effort: log and fall through so callers receive empty lists
      # rather than crashing the whole analysis run.
      logging.error(
          "No results returned from BigQuery. Please check the query.")
    return MetricContainer(
        values=metric_data['value'].tolist(),
        timestamps=metric_data['timestamp'].tolist(),
    )


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  load_test_metrics_fetcher = LoadTestMetricsFetcher()

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--config_file_path',
      required=True,
      type=str,
      help='Path to the config file that contains data to run the Change '
      'Point Analysis. The default file used will be '
      'apache_beam/testing/analyzers/tests.config.yml. '
      'If you would like to use the Change Point Analysis for finding '
      'performance regression in the tests, '
      'please provide an .yml file in the same structure as the above '
      'mentioned file.')
  parser.add_argument(
      '--save_alert_metadata',
      action='store_true',
      default=False,
      help='Save perf alert/ GH Issue metadata to BigQuery table.')
  known_args, unknown_args = parser.parse_known_args()

  if unknown_args:
    # Lazy %-style args instead of eager string formatting.
    logging.warning('Discarding unknown arguments : %s', unknown_args)

  perf_analysis.run(
      big_query_metrics_fetcher=load_test_metrics_fetcher,
      config_file_path=known_args.config_file_path,
      # Set this to true while running in production.
      save_alert_metadata=known_args.save_alert_metadata)
{constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS} """ + if bigquery is None: + raise ImportError('Bigquery dependencies are not installed.') client = bigquery.Client() query_job = client.query(query=query) metric_data = query_job.result().to_dataframe()