This is an automated email from the ASF dual-hosted git repository.
altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new f745702 Add Python snippet for Regex transform
new db59a3d Merge pull request #8905 from davidcavazos/element-wise-regex
f745702 is described below
commit f7457028dae67cd94003ef4405dae85c3c21b4c0
Author: David Cavazos <[email protected]>
AuthorDate: Mon Jun 10 16:25:33 2019 -0700
Add Python snippet for Regex transform
---
.../snippets/transforms/element_wise/regex.py | 161 +++++++++++++++++++++
.../snippets/transforms/element_wise/regex_test.py | 115 +++++++++++++++
2 files changed, 276 insertions(+)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
new file mode 100644
index 0000000..44aa9629
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+
+def regex_match(test=None):
+ # [START regex_match]
+ import apache_beam as beam
+ import re
+
+ def parse_plant(text):
+ m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text)
+ if m:
+ yield {
+ 'match': m.group(0), # contains the entire matched text
+ 'icon': m.group(1), # ([^\s-]+) - group
+ 'name': m.group(2), # (\w+) - group
+ 'duration': m.group('duration'), # (?P<duration>\w+) - named group
+ }
+
+ with beam.Pipeline() as pipeline:
+ plant_matches = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓 - Strawberry - perennial',
+ '🥕 - Carrot - biennial',
+ '# 🍌 - invalid - format',
+ '🍆\t-\tEggplant\t-\tperennial',
+ '🍅 - Tomato - annual',
+ '🍉 - invalid - format with trailing words',
+ '🥔-Potato-perennial',
+ ])
+ | 'Parse plants' >> beam.FlatMap(parse_plant)
+ | beam.Map(print)
+ )
+ # [END regex_match]
+ if test:
+ test(plant_matches)
+
+
+def regex_search(test=None):
+ # [START regex_search]
+ import apache_beam as beam
+ import re
+
+ def parse_plant_duration(text):
+ m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text)
+ if m:
+ yield {
+ 'match': m.group(0), # contains the entire matched text
+ 'icon': m.group(1), # ([^\s-]+) - group
+ 'name': m.group(2), # (\w+) - group
+ 'duration': m.group('duration'), # (?P<duration>\w+) - named group
+ }
+
+ with beam.Pipeline() as pipeline:
+ plant_matches = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '# 🍓 - Strawberry - perennial',
+ '# 🥕 - Carrot - biennial',
+ '# 🍆\t-\tEggplant\t-\tperennial',
+ '# 🍅 - Tomato - annual',
+ '# 🥔-Potato-perennial',
+ ])
+ | 'Parse plants' >> beam.FlatMap(parse_plant_duration)
+ | beam.Map(print)
+ )
+ # [END regex_search]
+ if test:
+ test(plant_matches)
+
+
+def regex_find_all(test=None):
+ # [START regex_find_all]
+ import apache_beam as beam
+ import re
+
+ def parse_words(text):
+ for m in re.finditer(r'[^\s-]+', text):
+ yield m.group()
+
+ with beam.Pipeline() as pipeline:
+ words = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓 - Strawberry - perennial',
+ '🥕 - Carrot - biennial',
+ '🍆\t-\tEggplant\t-\tperennial',
+ '🍅 - Tomato - annual',
+ '🥔-Potato-perennial',
+ ])
+ | 'Parse words' >> beam.FlatMap(parse_words)
+ | beam.Map(print)
+ )
+ # [END regex_find_all]
+ if test:
+ test(words)
+
+
+def regex_replace(test=None):
+ # [START regex_replace]
+ import apache_beam as beam
+ import re
+
+ with beam.Pipeline() as pipeline:
+ plants_csv = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓 - Strawberry - perennial',
+ '🥕 - Carrot - biennial',
+ '🍆\t-\tEggplant\t-\tperennial',
+ '🍅 - Tomato - annual',
+ '🥔-Potato-perennial',
+ ])
+ | 'To CSV' >> beam.Map(lambda text: re.sub(r'\s*-\s*', ',', text))
+ | beam.Map(print)
+ )
+ # [END regex_replace]
+ if test:
+ test(plants_csv)
+
+
+def regex_split(test=None):
+ # [START regex_split]
+ import apache_beam as beam
+ import re
+
+ with beam.Pipeline() as pipeline:
+ plants_columns = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓 - Strawberry - perennial',
+ '🥕 - Carrot - biennial',
+ '🍆\t-\tEggplant\t-\tperennial',
+ '🍅 - Tomato - annual',
+ '🥔-Potato-perennial',
+ ])
+ | 'Split' >> beam.Map(lambda text: re.split(r'\s*-\s*', text))
+ | beam.Map(print)
+ )
+ # [END regex_split]
+ if test:
+ test(plants_columns)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
new file mode 100644
index 0000000..27c9524
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import unittest
+
+import mock
+
+from apache_beam.examples.snippets.transforms.element_wise.regex import *
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
+
+
[email protected]('apache_beam.Pipeline', TestPipeline)
+# pylint: disable=line-too-long
[email protected]('apache_beam.examples.snippets.transforms.element_wise.regex.print',
lambda elem: elem)
+# pylint: enable=line-too-long
+class RegexTest(unittest.TestCase):
+ def __init__(self, methodName):
+ super(RegexTest, self).__init__(methodName)
+ # pylint: disable=line-too-long
+ # [START plant_matches]
+ plant_matches = [
+ {'match': '🍓 - Strawberry - perennial', 'icon': '🍓', 'name':
'Strawberry', 'duration': 'perennial'},
+ {'match': '🥕 - Carrot - biennial', 'icon': '🥕', 'name': 'Carrot',
'duration': 'biennial'},
+ {'match': '🍆\t-\tEggplant\t-\tperennial', 'icon': '🍆', 'name':
'Eggplant', 'duration': 'perennial'},
+ {'match': '🍅 - Tomato - annual', 'icon': '🍅', 'name': 'Tomato',
'duration': 'annual'},
+ {'match': '🥔-Potato-perennial', 'icon': '🥔', 'name': 'Potato',
'duration': 'perennial'},
+ ]
+ # [END plant_matches]
+ # pylint: enable=line-too-long
+ self.plant_matches_test = lambda actual: \
+ assert_that(actual, equal_to(plant_matches))
+
+ # [START words]
+ words = [
+ '🍓',
+ 'Strawberry',
+ 'perennial',
+ '🥕',
+ 'Carrot',
+ 'biennial',
+ '🍆',
+ 'Eggplant',
+ 'perennial',
+ '🍅',
+ 'Tomato',
+ 'annual',
+ '🥔',
+ 'Potato',
+ 'perennial',
+ ]
+ # [END words]
+ self.words_test = lambda actual: assert_that(actual, equal_to(words))
+
+ # [START plants_csv]
+ plants_csv = [
+ '🍓,Strawberry,perennial',
+ '🥕,Carrot,biennial',
+ '🍆,Eggplant,perennial',
+ '🍅,Tomato,annual',
+ '🥔,Potato,perennial',
+ ]
+ # [END plants_csv]
+ self.plants_csv_test = lambda actual: \
+ assert_that(actual, equal_to(plants_csv))
+
+ # [START plants_columns]
+ plants_columns = [
+ ['🍓', 'Strawberry', 'perennial'],
+ ['🥕', 'Carrot', 'biennial'],
+ ['🍆', 'Eggplant', 'perennial'],
+ ['🍅', 'Tomato', 'annual'],
+ ['🥔', 'Potato', 'perennial'],
+ ]
+ # [END plants_columns]
+ self.plants_columns_test = lambda actual: \
+ assert_that(actual, equal_to(plants_columns))
+
+ def test_regex_match(self):
+ regex_match(self.plant_matches_test)
+
+ def test_regex_search(self):
+ regex_search(self.plant_matches_test)
+
+ def test_regex_find_all(self):
+ regex_find_all(self.words_test)
+
+ def test_regex_replace(self):
+ regex_replace(self.plants_csv_test)
+
+ def test_regex_split(self):
+ regex_split(self.plants_columns_test)
+
+
+# Run the tests in this module when it is executed directly as a script.
+if __name__ == '__main__':
+ unittest.main()