This is an automated email from the ASF dual-hosted git repository. ahmedabualsaud pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push: new ea8596f2df0 Add ability to run performance regression checks on Beam IO Load tests. (#29226) ea8596f2df0 is described below commit ea8596f2df0e3e4b9da9f215ae6745c2ddfb6612 Author: Pranav Bhandari <bhandari.prana...@gmail.com> AuthorDate: Wed Nov 1 12:50:01 2023 -0400 Add ability to run performance regression checks on Beam IO Load tests. (#29226) --- .../testing/analyzers/io_tests_config.yaml | 256 +++++++++++++++++++++ .../testing/analyzers/load_test_perf_analysis.py | 98 ++++++++ .../testing/analyzers/perf_analysis_utils.py | 11 +- 3 files changed, 364 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml new file mode 100644 index 00000000000..2a33ae31797 --- /dev/null +++ b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml @@ -0,0 +1,256 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +spanner_io_read: + test_description: | + SpannerIO Read test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_read_runnerV2: + test_description: | + SpannerIO RunnerV2 Read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write: + test_description: | + SpannerIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write_runnerV2: + test_description: | + SpannerIO RunnerV2 write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read: + test_description: | + BigQueryIO Storage write API read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Storage write API read test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write_runnerV2: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read: + test_description: | + BigQueryIO Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write: + test_description: | + BigQueryIO Avro file loads write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_read: + test_description: | + BigQueryIO Json file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_write: + test_description: | + BigQueryIO Json file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_read: + test_description: | + BigTableIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,read-bigtable + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_write: + test_description: | + BigTableIO write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,write-bigtable + metric_name: + - RunTime + - EstimatedCost + +text_io_read: + test_description: | + TextIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_read_runnerV2: + test_description: | + TextIO RunnerV2 read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read_textio_v2 + metric_name: + - RunTime + - EstimatedCost + +text_io_write: + test_description: | + TextIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,write-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_write_runnerV2: + test_description: | + TextIO RunnerV2 write test 100 GB. 
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Entry point to run change-point (performance regression) analysis on
Beam IO load-test metrics stored in a BigQuery table."""

import argparse
import logging

from apache_beam.testing.analyzers import constants
from apache_beam.testing.analyzers import perf_analysis
from apache_beam.testing.analyzers import perf_analysis_utils
from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer

try:
  from google.cloud import bigquery
except ImportError:
  bigquery = None  # type: ignore


class LoadTestMetricsFetcher(perf_analysis_utils.MetricsFetcher):
  """Metrics fetcher used to get metric data from a BigQuery table.

  The metrics are fetched and returned as a dataclass containing lists of
  timestamps and metric_values.
  """
  def fetch_metric_data(
      self, *, test_config: TestConfigContainer) -> MetricContainer:
    """Fetch the most recent metric values for one load test.

    Args:
      test_config: Config entry describing the BigQuery table to query.
        ``test_config.test_name`` must be in the format
        ``testName,pipelineName``.

    Returns:
      A MetricContainer holding the fetched metric values and their
      timestamps, newest first. Both lists are empty when the query
      returns no rows (an error is logged in that case).

    Raises:
      ValueError: If ``test_name`` is missing or not in the expected
        ``testName,pipelineName`` format.
      ImportError: If the BigQuery client library is not installed.
    """
    if not test_config.test_name:
      # ValueError instead of bare Exception: more specific, and still
      # caught by any existing `except Exception` handlers.
      raise ValueError("test_name not provided in config.")
    try:
      test_name, pipeline_name = test_config.test_name.split(',')
    except ValueError:
      # Re-raise the tuple-unpacking failure with an actionable message.
      raise ValueError(
          'test_name must be in the format "testName,pipelineName", '
          f'got: {test_config.test_name}')

    # Fail fast before building the query or touching the network.
    if bigquery is None:
      raise ImportError('Bigquery dependencies are not installed.')

    # NOTE(review): the query is assembled via f-string interpolation.
    # The values come from a checked-in YAML config, not end-user input,
    # so this is accepted here; do not reuse this pattern with untrusted
    # values.
    query = f"""
      SELECT timestamp, metric.value
      FROM {test_config.project}.{test_config.metrics_dataset}.{test_config.metrics_table}
      CROSS JOIN UNNEST(metrics) AS metric
      WHERE test_name = "{test_name}" AND pipeline_name = "{pipeline_name}" AND metric.name = "{test_config.metric_name}"
      ORDER BY timestamp DESC
      LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
    """
    # Lazy %-style args: the message is only rendered when DEBUG is enabled.
    logging.debug("Running query: %s", query)
    client = bigquery.Client()
    query_job = client.query(query=query)
    metric_data = query_job.result().to_dataframe()
    if metric_data.empty:
      # Best-effort: log and fall through so callers receive empty lists
      # rather than crashing the whole analysis run.
      logging.error(
          "No results returned from BigQuery. Please check the query.")
    return MetricContainer(
        values=metric_data['value'].tolist(),
        timestamps=metric_data['timestamp'].tolist(),
    )


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  load_test_metrics_fetcher = LoadTestMetricsFetcher()

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--config_file_path',
      required=True,
      type=str,
      help='Path to the config file that contains data to run the Change '
      'Point Analysis. The default file used will be '
      'apache_beam/testing/analyzers/tests.config.yml. '
      'If you would like to use the Change Point Analysis for finding '
      'performance regression in the tests, '
      'please provide an .yml file in the same structure as the above '
      'mentioned file.')
  parser.add_argument(
      '--save_alert_metadata',
      action='store_true',
      default=False,
      help='Save perf alert/ GH Issue metadata to BigQuery table.')
  known_args, unknown_args = parser.parse_known_args()

  if unknown_args:
    # Lazy %-style args instead of eager string formatting.
    logging.warning('Discarding unknown arguments : %s', unknown_args)

  perf_analysis.run(
      big_query_metrics_fetcher=load_test_metrics_fetcher,
      config_file_path=known_args.config_file_path,
      # Set this to true while running in production.
      save_alert_metadata=known_args.save_alert_metadata)
{constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS} """ + if bigquery is None: + raise ImportError('Bigquery dependencies are not installed.') client = bigquery.Client() query_job = client.query(query=query) metric_data = query_job.result().to_dataframe()