This is an automated email from the ASF dual-hosted git repository. altay pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push: new 41dd4bf [BEAM-7389] Update to use util.Regex transform new da6c1a8 Merge pull request #9435 from davidcavazos/regex-sample 41dd4bf is described below commit 41dd4bf8b106dcbc48146e174ca468ff90d3cdfc Author: David Cavazos <dcava...@google.com> AuthorDate: Mon Aug 26 16:38:42 2019 -0700 [BEAM-7389] Update to use util.Regex transform --- .../snippets/transforms/element_wise/regex.py | 235 ++++++++++++++------- .../snippets/transforms/element_wise/regex_test.py | 210 +++++++++++------- 2 files changed, 289 insertions(+), 156 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py index 44aa9629..975d5d3 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py @@ -20,142 +20,217 @@ from __future__ import absolute_import from __future__ import print_function -def regex_match(test=None): - # [START regex_match] +def regex_matches(test=None): + # [START regex_matches] import apache_beam as beam - import re - - def parse_plant(text): - m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text) - if m: - yield { - 'match': m.group(0), # contains the entire matched text - 'icon': m.group(1), # ([^\s-]+) - group - 'name': m.group(2), # (\w+) - group - 'duration': m.group('duration'), # (?P<duration>\w+) - named group - } + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' with beam.Pipeline() as pipeline: - plant_matches = ( + plants_matches = ( pipeline | 'Garden plants' >> beam.Create([ - '🍓 - Strawberry - perennial', - '🥕 - Carrot - biennial', - '# 🍌 - invalid - format', - '🍆\t-\tEggplant\t-\tperennial', - '🍅 - Tomato - annual', - '🍉 - invalid - format with trailing words', - '🥔-Potato-perennial', + '🍓, Strawberry, perennial', + '🥕, Carrot, biennial ignoring trailing words', + '🍆, Eggplant, perennial', + '🍅, Tomato, annual', + '🥔,Potato,perennial', + '# 🍌, invalid, format', + 'invalid, 🍉, format', ]) - | 'Parse plants' >> beam.FlatMap(parse_plant) + | 'Parse plants' >> beam.Regex.matches(regex) | beam.Map(print) ) - # [END regex_match] + # [END regex_matches] if test: - test(plant_matches) + test(plants_matches) -def regex_search(test=None): - # [START regex_search] +def regex_all_matches(test=None): + # [START regex_all_matches] import apache_beam as beam - import re - - def parse_plant_duration(text): - m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text) - if m: - yield { - 'match': m.group(0), # contains the entire matched text - 'icon': m.group(1), # ([^\s-]+) - group - 'name': m.group(2), # (\w+) - group - 'duration': m.group('duration'), # (?P<duration>\w+) - named group - } + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' with beam.Pipeline() as pipeline: - plant_matches = ( + plants_all_matches = ( pipeline | 'Garden plants' >> beam.Create([ - '# 🍓 - Strawberry - perennial', - '# 🥕 - Carrot - biennial', - '# 🍆\t-\tEggplant\t-\tperennial', - '# 🍅 - Tomato - annual', - '# 🥔-Potato-perennial', + '🍓, Strawberry, perennial', + '🥕, Carrot, biennial ignoring trailing words', + '🍆, Eggplant, perennial', + '🍅, Tomato, annual', + '🥔,Potato,perennial', + '# 🍌, invalid, format', + 'invalid, 🍉, format', ]) - | 'Parse plants' >> beam.FlatMap(parse_plant_duration) + | 'Parse plants' >> beam.Regex.all_matches(regex) | beam.Map(print) ) - # [END regex_search] + # [END regex_all_matches] if test: - test(plant_matches) + test(plants_all_matches) + + +def regex_matches_kv(test=None): + # [START regex_matches_kv] + import apache_beam as beam + + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' + with beam.Pipeline() as pipeline: + plants_matches_kv = ( + pipeline + | 'Garden plants' >> beam.Create([ + '🍓, Strawberry, perennial', + '🥕, Carrot, biennial ignoring trailing words', + '🍆, Eggplant, perennial', + '🍅, Tomato, annual', + '🥔,Potato,perennial', + '# 🍌, invalid, format', + 'invalid, 🍉, format', + ]) + | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon') + | beam.Map(print) + ) + # [END regex_matches_kv] + if test: + test(plants_matches_kv) + + +def regex_find(test=None): + # [START regex_find] + import apache_beam as beam + + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' + with beam.Pipeline() as pipeline: + plants_matches = ( + pipeline + | 'Garden plants' >> beam.Create([ + '# 🍓, Strawberry, perennial', + '# 🥕, Carrot, biennial ignoring trailing words', + '# 🍆, Eggplant, perennial - 🍌, Banana, perennial', + '# 🍅, Tomato, annual - 🍉, Watermelon, annual', + '# 🥔,Potato,perennial', + ]) + | 'Parse plants' >> beam.Regex.find(regex) + | beam.Map(print) + ) + # [END regex_find] + if test: + test(plants_matches) def regex_find_all(test=None): # [START regex_find_all] import apache_beam as beam - import re - - def parse_words(text): - for m in re.finditer(r'[^\s-]+', text): - yield m.group() + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' with beam.Pipeline() as pipeline: - words = ( + plants_find_all = ( pipeline | 'Garden plants' >> beam.Create([ - '🍓 - Strawberry - perennial', - '🥕 - Carrot - biennial', - '🍆\t-\tEggplant\t-\tperennial', - '🍅 - Tomato - annual', - '🥔-Potato-perennial', + '# 🍓, Strawberry, perennial', + '# 🥕, Carrot, biennial ignoring trailing words', + '# 🍆, Eggplant, perennial - 🍌, Banana, perennial', + '# 🍅, Tomato, annual - 🍉, Watermelon, annual', + '# 🥔,Potato,perennial', ]) - | 'Parse words' >> beam.FlatMap(parse_words) + | 'Parse plants' >> beam.Regex.find_all(regex) | beam.Map(print) ) # [END regex_find_all] if test: - test(words) + test(plants_find_all) + + +def regex_find_kv(test=None): + # [START regex_find_kv] + import apache_beam as beam + + # Matches a named group 'icon', and then two comma-separated groups. + regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)' + with beam.Pipeline() as pipeline: + plants_matches_kv = ( + pipeline + | 'Garden plants' >> beam.Create([ + '# 🍓, Strawberry, perennial', + '# 🥕, Carrot, biennial ignoring trailing words', + '# 🍆, Eggplant, perennial - 🍌, Banana, perennial', + '# 🍅, Tomato, annual - 🍉, Watermelon, annual', + '# 🥔,Potato,perennial', + ]) + | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon') + | beam.Map(print) + ) + # [END regex_find_kv] + if test: + test(plants_matches_kv) + + +def regex_replace_all(test=None): + # [START regex_replace_all] + import apache_beam as beam + + with beam.Pipeline() as pipeline: + plants_replace_all = ( + pipeline + | 'Garden plants' >> beam.Create([ + '🍓 : Strawberry : perennial', + '🥕 : Carrot : biennial', + '🍆\t\t:\t\tEggplant\t\t:\t\tperennial', + '🍅 : Tomato : annual', + '🥔:Potato:perennial', + ]) + | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',') + | beam.Map(print) + ) + # [END regex_replace_all] + if test: + test(plants_replace_all) -def regex_replace(test=None): - # [START regex_replace] +def regex_replace_first(test=None): + # [START regex_replace_first] import apache_beam as beam - import re with beam.Pipeline() as pipeline: - plants_csv = ( + plants_replace_first = ( pipeline | 'Garden plants' >> beam.Create([ - '🍓 - Strawberry - perennial', - '🥕 - Carrot - biennial', - '🍆\t-\tEggplant\t-\tperennial', - '🍅 - Tomato - annual', - '🥔-Potato-perennial', + '🍓 , Strawberry, perennial', + '🥕, Carrot, biennial', + '🍆\t\t,\t\tEggplant, perennial', + '🍅, Tomato, annual', + '🥔,Potato, perennial', ]) - | 'To CSV' >> beam.Map(lambda text: re.sub(r'\s*-\s*', ',', text)) + | 'As dictionary' >> beam.Regex.replace_first(r'\s*,\s*', ': ') | beam.Map(print) ) - # [END regex_replace] + # [END regex_replace_first] if test: - test(plants_csv) + test(plants_replace_first) def regex_split(test=None): # [START regex_split] import apache_beam as beam - import re with beam.Pipeline() as pipeline: - plants_columns = ( + plants_split = ( pipeline | 'Garden plants' >> beam.Create([ - '🍓 - Strawberry - perennial', - '🥕 - Carrot - biennial', - '🍆\t-\tEggplant\t-\tperennial', - '🍅 - Tomato - annual', - '🥔-Potato-perennial', + '🍓 : Strawberry : perennial', + '🥕 : Carrot : biennial', + '🍆\t\t:\t\tEggplant : perennial', + '🍅 : Tomato : annual', + '🥔:Potato:perennial', ]) - | 'Split' >> beam.Map(lambda text: re.split(r'\s*-\s*', text)) + | 'Parse plants' >> beam.Regex.split(r'\s*:\s*') | beam.Map(print) ) # [END regex_split] if test: - test(plants_columns) + test(plants_split) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py index 27c9524..df4fc39 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py @@ -23,92 +23,150 @@ import unittest import mock -from apache_beam.examples.snippets.transforms.element_wise.regex import * from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +from . import regex + + +def check_matches(actual): + # [START plants_matches] + plants_matches = [ + '🍓, Strawberry, perennial', + '🥕, Carrot, biennial', + '🍆, Eggplant, perennial', + '🍅, Tomato, annual', + '🥔,Potato,perennial', + ] + # [END plants_matches] + assert_that(actual, equal_to(plants_matches)) + + +def check_all_matches(actual): + # [START plants_all_matches] + plants_all_matches = [ + ['🍓, Strawberry, perennial', '🍓', 'Strawberry', 'perennial'], + ['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial'], + ['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial'], + ['🍅, Tomato, annual', '🍅', 'Tomato', 'annual'], + ['🥔,Potato,perennial', '🥔', 'Potato', 'perennial'], + ] + # [END plants_all_matches] + assert_that(actual, equal_to(plants_all_matches)) + + +def check_matches_kv(actual): + # [START plants_matches_kv] + plants_matches_kv = [ + ('🍓', '🍓, Strawberry, perennial'), + ('🥕', '🥕, Carrot, biennial'), + ('🍆', '🍆, Eggplant, perennial'), + ('🍅', '🍅, Tomato, annual'), + ('🥔', '🥔,Potato,perennial'), + ] + # [END plants_matches_kv] + assert_that(actual, equal_to(plants_matches_kv)) + + +def check_find_all(actual): + # [START plants_find_all] + plants_find_all = [ + ['🍓, Strawberry, perennial'], + ['🥕, Carrot, biennial'], + ['🍆, Eggplant, perennial', '🍌, Banana, perennial'], + ['🍅, Tomato, annual', '🍉, Watermelon, annual'], + ['🥔,Potato,perennial'], + ] + # [END plants_find_all] + assert_that(actual, equal_to(plants_find_all)) + + +def check_find_kv(actual): + # [START plants_find_kv] + plants_find_all = [ + ('🍓', '🍓, Strawberry, perennial'), + ('🥕', '🥕, Carrot, biennial'), + ('🍆', '🍆, Eggplant, perennial'), + ('🍌', '🍌, Banana, perennial'), + ('🍅', '🍅, Tomato, annual'), + ('🍉', '🍉, Watermelon, annual'), + ('🥔', '🥔,Potato,perennial'), + ] + # [END plants_find_kv] + assert_that(actual, equal_to(plants_find_all)) + + +def check_replace_all(actual): + # [START plants_replace_all] + plants_replace_all = [ + '🍓,Strawberry,perennial', + '🥕,Carrot,biennial', + '🍆,Eggplant,perennial', + '🍅,Tomato,annual', + '🥔,Potato,perennial', + ] + # [END plants_replace_all] + assert_that(actual, equal_to(plants_replace_all)) + + +def check_replace_first(actual): + # [START plants_replace_first] + plants_replace_first = [ + '🍓: Strawberry, perennial', + '🥕: Carrot, biennial', + '🍆: Eggplant, perennial', + '🍅: Tomato, annual', + '🥔: Potato, perennial', + ] + # [END plants_replace_first] + assert_that(actual, equal_to(plants_replace_first)) + + +def check_split(actual): + # [START plants_split] + plants_split = [ + ['🍓', 'Strawberry', 'perennial'], + ['🥕', 'Carrot', 'biennial'], + ['🍆', 'Eggplant', 'perennial'], + ['🍅', 'Tomato', 'annual'], + ['🥔', 'Potato', 'perennial'], + ] + # [END plants_replace_first] + assert_that(actual, equal_to(plants_split)) + @mock.patch('apache_beam.Pipeline', TestPipeline) # pylint: disable=line-too-long @mock.patch('apache_beam.examples.snippets.transforms.element_wise.regex.print', lambda elem: elem) # pylint: enable=line-too-long class RegexTest(unittest.TestCase): - def __init__(self, methodName): - super(RegexTest, self).__init__(methodName) - # pylint: disable=line-too-long - # [START plant_matches] - plant_matches = [ - {'match': '🍓 - Strawberry - perennial', 'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}, - {'match': '🥕 - Carrot - biennial', 'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}, - {'match': '🍆\t-\tEggplant\t-\tperennial', 'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}, - {'match': '🍅 - Tomato - annual', 'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}, - {'match': '🥔-Potato-perennial', 'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}, - ] - # [END plant_matches] - # pylint: enable=line-too-long - self.plant_matches_test = lambda actual: \ - assert_that(actual, equal_to(plant_matches)) - - # [START words] - words = [ - '🍓', - 'Strawberry', - 'perennial', - '🥕', - 'Carrot', - 'biennial', - '🍆', - 'Eggplant', - 'perennial', - '🍅', - 'Tomato', - 'annual', - '🥔', - 'Potato', - 'perennial', - ] - # [END words] - self.words_test = lambda actual: assert_that(actual, equal_to(words)) - - # [START plants_csv] - plants_csv = [ - '🍓,Strawberry,perennial', - '🥕,Carrot,biennial', - '🍆,Eggplant,perennial', - '🍅,Tomato,annual', - '🥔,Potato,perennial', - ] - # [END plants_csv] - self.plants_csv_test = lambda actual: \ - assert_that(actual, equal_to(plants_csv)) - - # [START plants_columns] - plants_columns = [ - ['🍓', 'Strawberry', 'perennial'], - ['🥕', 'Carrot', 'biennial'], - ['🍆', 'Eggplant', 'perennial'], - ['🍅', 'Tomato', 'annual'], - ['🥔', 'Potato', 'perennial'], - ] - # [END plants_columns] - self.plants_columns_test = lambda actual: \ - assert_that(actual, equal_to(plants_columns)) - - def test_regex_match(self): - regex_match(self.plant_matches_test) - - def test_regex_search(self): - regex_search(self.plant_matches_test) - - def test_regex_find_all(self): - regex_find_all(self.words_test) - - def test_regex_replace(self): - regex_replace(self.plants_csv_test) - - def test_regex_split(self): - regex_split(self.plants_columns_test) + def test_matches(self): + regex.regex_matches(check_matches) + + def test_all_matches(self): + regex.regex_all_matches(check_all_matches) + + def test_matches_kv(self): + regex.regex_matches_kv(check_matches_kv) + + def test_find(self): + regex.regex_find(check_matches) + + def test_find_all(self): + regex.regex_find_all(check_find_all) + + def test_find_kv(self): + regex.regex_find_kv(check_find_kv) + + def test_replace_all(self): + regex.regex_replace_all(check_replace_all) + + def test_replace_first(self): + regex.regex_replace_first(check_replace_first) + + def test_split(self): + regex.regex_split(check_split) if __name__ == '__main__':