Lee-W commented on code in PR #64571:
URL: https://github.com/apache/airflow/pull/64571#discussion_r3309887735
##########
airflow-core/src/airflow/partition_mappers/temporal.py:
##########
@@ -27,7 +28,124 @@
from pendulum import FixedTimezone, Timezone
-class _BaseTemporalMapper(PartitionMapper, ABC):
+_STRPTIME_PATTERNS: dict[str, str] = {
+ "%Y": r"\d{4}",
+ "%m": r"\d{2}",
+ "%d": r"\d{2}",
+ "%H": r"\d{2}",
+ "%M": r"\d{2}",
+ "%S": r"\d{2}",
+ "%V": r"\d{2}",
+ "%U": r"\d{2}",
+ "%W": r"\d{2}",
+}
+
+
+def _compile_output_format_regex(
+ fmt: str, placeholder_patterns: dict[str, str] | None = None
+) -> re.Pattern[str]:
+ r"""
+ Compile *fmt* into a regex with named groups so a formatted key can be
parsed back.
+
+ Walks *fmt* left-to-right, classifying each position into one of three
+ branches:
+
+ 1. A two-character strftime directive listed in ``_STRPTIME_PATTERNS``
+ (``%Y``, ``%m``, ``%V`` …) becomes a named group keyed on the directive
+ letter — e.g. ``%Y`` → ``(?P<Y>\d{4})``. The ``i + 1 < len(fmt)`` guard
+ lets a trailing bare ``%`` fall through to the literal branch (escaped
+ as ``\%``), and unrecognised directives like ``%X`` likewise fall
+ through (matched literally; the format will not round-trip through
+ strftime output that uses them).
+ 2. A FastAPI-style ``{name}`` placeholder becomes a named group keyed on
+ *name*. The pattern defaults to ``\w+``; pass *placeholder_patterns*
+ to narrow it (e.g. ``{"quarter": r"[1-4]"}``). A ``{`` with no
+ matching ``}`` falls through to the literal branch (so a stray ``{``
+ is escaped, not raised).
+ 3. Anything else is escaped via :func:`re.escape` and emitted verbatim,
+ so separators, literal hyphens, ``Q``/``W`` prefix letters, and so on
+ all participate in the match.
+
+ The compiled regex is consumed via ``.search`` (not ``.fullmatch``) so the
+ caller does not need to anchor on the literal prefix/suffix length.
+
+ Raises :exc:`ValueError` at compile time when *fmt* is structurally
+ malformed — empty ``{}``, a placeholder name that is not a valid Python
+ identifier, the same strftime directive used twice, the same
+ placeholder used twice, or two adjacent default-pattern placeholders
+ (the greedy ``\w+`` default would consume the previous group's tail,
+ yielding a parse the caller almost never intends — insert a separator
+ or narrow at least one placeholder via *placeholder_patterns*).
+ Catching these at construction keeps a misconfigured mapper from
+ surfacing as an opaque ``re.error`` deep inside the scheduler tick or
+ a UI route.
+
+ Known limitation: locale-sensitive strftime directives (``%A``, ``%a``,
+ ``%B``, ``%b``, ``%p``) are not in ``_STRPTIME_PATTERNS`` and are
+ treated as literals; formats relying on them are not invertible by
+ this helper.
+ """
+ placeholder_patterns = placeholder_patterns or {}
+ parts: list[str] = []
+ seen_groups: set[str] = set()
+ prev_was_default_placeholder = False
+ i = 0
+ while i < len(fmt):
+ if fmt[i] == "%" and i + 1 < len(fmt) and fmt[i : i + 2] in
_STRPTIME_PATTERNS:
+ directive = fmt[i : i + 2]
+ name = directive[1]
+ if name in seen_groups:
+ raise ValueError(
+ f"output_format {fmt!r} reuses directive {directive!r}; "
+ "each strftime directive may appear at most once."
+ )
+ seen_groups.add(name)
+ parts.append(f"(?P<{name}>{_STRPTIME_PATTERNS[directive]})")
+ i += 2
+ prev_was_default_placeholder = False
+ continue
+ if fmt[i] == "{":
+ end = fmt.find("}", i)
+ if end != -1:
+ name = fmt[i + 1 : end]
+ if not name.isidentifier():
+ raise ValueError(
+ f"output_format {fmt!r} has invalid placeholder
{{{name}}}; "
+ "placeholder names must be valid Python identifiers."
+ )
+ if name in seen_groups:
+ raise ValueError(
+ f"output_format {fmt!r} reuses placeholder {{{name}}};
"
+ "each placeholder may appear at most once."
+ )
+ seen_groups.add(name)
+ is_default = name not in placeholder_patterns
+ if is_default and prev_was_default_placeholder:
+ # Two adjacent default-pattern (``\w+``) placeholders.
+ # The greedy first group consumes everything except the
+ # minimum length the second needs, producing a parse the
+ # caller almost never intends (e.g. ``{a}{b}`` on "foo123"
+ # yields a="foo12", b="3"). Either insert a literal
+ # separator between them or narrow at least one pattern
+ # via *placeholder_patterns*.
+ raise ValueError(
+ f"output_format {fmt!r} has adjacent default-pattern
placeholders "
+ f"ending at {{{name}}}; the greedy ``\\w+`` default
would consume "
+ "the previous group's tail. Insert a separator between
the "
+ "placeholders, or narrow at least one of them via
placeholder_patterns."
+ )
+ pattern = placeholder_patterns.get(name, r"\w+")
+ parts.append(f"(?P<{name}>{pattern})")
+ i = end + 1
+ prev_was_default_placeholder = is_default
+ continue
+ parts.append(re.escape(fmt[i]))
+ i += 1
+ prev_was_default_placeholder = False
+ return re.compile("".join(parts))
Review Comment:
_compile_output_format_regex` now returns `re.compile(f"^{...}$")` so the
two `.search()` callers (`StartOfWeekMapper`, `StartOfQuarterMapper`) eject
malformed keys with trailing or leading garbage instead of silently matching a
substring.
##########
airflow-core/src/airflow/partition_mappers/temporal.py:
##########
@@ -27,7 +28,124 @@
from pendulum import FixedTimezone, Timezone
-class _BaseTemporalMapper(PartitionMapper, ABC):
+_STRPTIME_PATTERNS: dict[str, str] = {
+ "%Y": r"\d{4}",
+ "%m": r"\d{2}",
+ "%d": r"\d{2}",
+ "%H": r"\d{2}",
+ "%M": r"\d{2}",
+ "%S": r"\d{2}",
+ "%V": r"\d{2}",
+ "%U": r"\d{2}",
+ "%W": r"\d{2}",
+}
+
+
+def _compile_output_format_regex(
+ fmt: str, placeholder_patterns: dict[str, str] | None = None
+) -> re.Pattern[str]:
+ r"""
+ Compile *fmt* into a regex with named groups so a formatted key can be
parsed back.
+
+ Walks *fmt* left-to-right, classifying each position into one of three
+ branches:
+
+ 1. A two-character strftime directive listed in ``_STRPTIME_PATTERNS``
+ (``%Y``, ``%m``, ``%V`` …) becomes a named group keyed on the directive
+ letter — e.g. ``%Y`` → ``(?P<Y>\d{4})``. The ``i + 1 < len(fmt)`` guard
+ lets a trailing bare ``%`` fall through to the literal branch (escaped
+ as ``\%``), and unrecognised directives like ``%X`` likewise fall
+ through (matched literally; the format will not round-trip through
+ strftime output that uses them).
+ 2. A FastAPI-style ``{name}`` placeholder becomes a named group keyed on
+ *name*. The pattern defaults to ``\w+``; pass *placeholder_patterns*
+ to narrow it (e.g. ``{"quarter": r"[1-4]"}``). A ``{`` with no
+ matching ``}`` falls through to the literal branch (so a stray ``{``
+ is escaped, not raised).
+ 3. Anything else is escaped via :func:`re.escape` and emitted verbatim,
+ so separators, literal hyphens, ``Q``/``W`` prefix letters, and so on
+ all participate in the match.
+
+ The compiled regex is consumed via ``.search`` (not ``.fullmatch``) so the
+ caller does not need to anchor on the literal prefix/suffix length.
+
+ Raises :exc:`ValueError` at compile time when *fmt* is structurally
+ malformed — empty ``{}``, a placeholder name that is not a valid Python
+ identifier, the same strftime directive used twice, the same
+ placeholder used twice, or two adjacent default-pattern placeholders
+ (the greedy ``\w+`` default would consume the previous group's tail,
+ yielding a parse the caller almost never intends — insert a separator
+ or narrow at least one placeholder via *placeholder_patterns*).
+ Catching these at construction keeps a misconfigured mapper from
+ surfacing as an opaque ``re.error`` deep inside the scheduler tick or
+ a UI route.
+
+ Known limitation: locale-sensitive strftime directives (``%A``, ``%a``,
+ ``%B``, ``%b``, ``%p``) are not in ``_STRPTIME_PATTERNS`` and are
+ treated as literals; formats relying on them are not invertible by
+ this helper.
+ """
+ placeholder_patterns = placeholder_patterns or {}
+ parts: list[str] = []
+ seen_groups: set[str] = set()
+ prev_was_default_placeholder = False
+ i = 0
+ while i < len(fmt):
+ if fmt[i] == "%" and i + 1 < len(fmt) and fmt[i : i + 2] in
_STRPTIME_PATTERNS:
+ directive = fmt[i : i + 2]
+ name = directive[1]
+ if name in seen_groups:
+ raise ValueError(
+ f"output_format {fmt!r} reuses directive {directive!r}; "
+ "each strftime directive may appear at most once."
+ )
+ seen_groups.add(name)
+ parts.append(f"(?P<{name}>{_STRPTIME_PATTERNS[directive]})")
+ i += 2
+ prev_was_default_placeholder = False
+ continue
+ if fmt[i] == "{":
+ end = fmt.find("}", i)
+ if end != -1:
+ name = fmt[i + 1 : end]
+ if not name.isidentifier():
+ raise ValueError(
+ f"output_format {fmt!r} has invalid placeholder
{{{name}}}; "
+ "placeholder names must be valid Python identifiers."
+ )
+ if name in seen_groups:
+ raise ValueError(
+ f"output_format {fmt!r} reuses placeholder {{{name}}};
"
+ "each placeholder may appear at most once."
+ )
+ seen_groups.add(name)
+ is_default = name not in placeholder_patterns
+ if is_default and prev_was_default_placeholder:
+ # Two adjacent default-pattern (``\w+``) placeholders.
+ # The greedy first group consumes everything except the
+ # minimum length the second needs, producing a parse the
+ # caller almost never intends (e.g. ``{a}{b}`` on "foo123"
+ # yields a="foo12", b="3"). Either insert a literal
+ # separator between them or narrow at least one pattern
+ # via *placeholder_patterns*.
+ raise ValueError(
+ f"output_format {fmt!r} has adjacent default-pattern
placeholders "
+ f"ending at {{{name}}}; the greedy ``\\w+`` default
would consume "
+ "the previous group's tail. Insert a separator between
the "
+ "placeholders, or narrow at least one of them via
placeholder_patterns."
+ )
+ pattern = placeholder_patterns.get(name, r"\w+")
+ parts.append(f"(?P<{name}>{pattern})")
+ i = end + 1
+ prev_was_default_placeholder = is_default
+ continue
+ parts.append(re.escape(fmt[i]))
+ i += 1
+ prev_was_default_placeholder = False
+ return re.compile("".join(parts))
Review Comment:
`_compile_output_format_regex` now returns `re.compile(f"^{...}$")` so the
two `.search()` callers (`StartOfWeekMapper`, `StartOfQuarterMapper`) eject
malformed keys with trailing or leading garbage instead of silently matching a
substring.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]