This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch madlib2-master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit b944045e624e791b6c41bca4ef5d56ba54d4bb68 Author: Nikhil Kak <n...@vmware.com> AuthorDate: Tue Feb 20 15:06:46 2024 -0800 PMML: Consider spaces when parsing the indep var JIRA: MADLIB-1517 A previous commit 0cd28f9733927d63beaefc9488db7f8bfdb3bd80 added support to parse the independent var expression to determine if an intercept was used during training. This commit improves the regex by adding support for spaces and also adds a detailed explanation for the regex. This commit also fixes a warning that would get generated with the previous regex: ``` re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I) <stdin>:1: FutureWarning: Possible nested set at position 6 ``` --- src/ports/postgres/modules/pmml/formula.py_in | 16 +- .../pmml/test/unit_tests/test_formula.py_in | 177 +++++++++++++++++++++ 2 files changed, 191 insertions(+), 2 deletions(-) diff --git a/src/ports/postgres/modules/pmml/formula.py_in b/src/ports/postgres/modules/pmml/formula.py_in index 0d575315..5f97bb51 100644 --- a/src/ports/postgres/modules/pmml/formula.py_in +++ b/src/ports/postgres/modules/pmml/formula.py_in @@ -12,8 +12,20 @@ class Formula(object): :param coef_len: Length of all the coefficients including the intercept's coefficient(if any) """ - # TODO: Fix the nested warning and add explanation for the regex - self.array_expr = re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I) + + self.array_expr = re.compile(r'array\[(\s*?[0-1]\s*?,\s*?|\s*?[0-1].0\s*?,\s*?)?(["a-z0-9_, .]+)]', + flags=re.I) + # Regex explanation: + # array\[ matches array[ or ARRAY[ + # \s*? matches 0 or more spaces + # | represents an OR + # [0-1]\s*?, matches either "1," or "0," including spaces + # [0-1].0\s*?, matches either "1.0," or "0.0," including spaces + # [0-1]\s*?,\s*?|\s*?[0-1].0\s*?, matches either "1", "0", "1.0", or "0.0" including spaces + # ()? captures the output of that group. ? means it's optional + # That's why we use ()? 
for the first capture group i.e "1,", "0,", "1.0," or "0.0," + # (["a-z0-9_, .]+) matches any occurrences of these characters and captures the output in a group + self.non_array_expr = re.compile(r'["a-z0-9_]+', flags=re.I) self.intercept = self.has_intercept(x_str) diff --git a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in index 6075edc4..2ce7b8ae 100644 --- a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in +++ b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in @@ -138,11 +138,22 @@ class FormulaTestCase(unittest.TestCase): self.assertEqual(f.intercept, True) def test_formula_array_with_invalid_intercept(self): + f = self.subject.Formula('baaz', 'ARRAY[0.1,foo,bar]', 3) + self.assertEqual(f.x, ['0.1', 'foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, False) + + f = self.subject.Formula('baaz', 'ARRAY[10,foo,bar]', 3) self.assertEqual(f.x, ['10', 'foo', 'bar']) self.assertEqual(f.y, "baaz") self.assertEqual(f.intercept, False) + f = self.subject.Formula('baaz', 'ARRAY[ 10 , foo,bar]', 3) + self.assertEqual(f.x, ['10', 'foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, False) + # A negative number shouldn't be allowed technically the train functions # don't error out, so adding this test for the sake of completeness f = self.subject.Formula('baaz', 'ARRAY[-2,foo,bar]', 3) @@ -203,6 +214,172 @@ class FormulaTestCase(unittest.TestCase): self.assertEqual(f.y, "baaz") self.assertEqual(f.intercept, False) + def test_formula_array_with_spaces_with_intercept(self): + f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1 ,"1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, 
True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + + f = self.subject.Formula('baaz', 'ARRAY[ 1 , foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1, "1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1, foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1 , foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1 , foo , bar ]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1 ,"1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1.0 ,"1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) 
+ self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0 , foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1.0, "1.0",bar]', 3) + self.assertEqual(f.x, ['1.0', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0, foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0 , foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 1.0 , foo , bar ]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[1.0 ,"1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 0,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 0,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + + f = self.subject.Formula('baaz', 'ARRAY[ 0 ,foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = 
self.subject.Formula('baaz', 'ARRAY[0, "1",bar]', 3) + self.assertEqual(f.x, ['1', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 0, foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 0 , foo,bar]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + f = self.subject.Formula('baaz', 'ARRAY[ 0 , foo , bar ]', 3) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, True) + + def test_formula_array_with_spaces_without_intercept(self): + f = self.subject.Formula('baaz', 'ARRAY[ foo,bar]', 2) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, False) + + f = self.subject.Formula('baaz', 'ARRAY[ foo , bar ]', 2) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, False) + + f = self.subject.Formula('baaz', 'ARRAY[foo ,bar]', 2) + self.assertEqual(f.x, ['foo', 'bar']) + self.assertEqual(f.y, "baaz") + self.assertEqual(f.intercept, False) + + def test_formula_nonarray(self): f = self.subject.Formula('baaz', 'foo', 3) self.assertEqual(f.x, ['foo[1]', 'foo[2]'])