cloud-fan commented on code in PR #56026:
URL: https://github.com/apache/spark/pull/56026#discussion_r3289110372
##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
resolve_jira_issue(merge_branches, comment, jira_id)
-def standardize_jira_ref(text):
- """
- Standardize the [SPARK-XXXXX] [MODULE] prefix
- Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK
XXX [MLLIB]: Issue" to
- "[SPARK-XXX][MLLIB] Issue"
-
- >>> standardize_jira_ref(
- ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete
is successful")
- '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is
successful'
- >>> standardize_jira_ref(
- ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in
pull requests")
- '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull
requests'
- >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
- '[SPARK-5954][MLLIB] Top by key'
- >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing
in TaskSchedulerImpl")
- '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
- >>> standardize_jira_ref(
- ... "SPARK-1094 Support MiMa for reporting binary compatibility across
versions.")
- '[SPARK-1094] Support MiMa for reporting binary compatibility across
versions.'
- >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
- '[SPARK-1146][WIP] Vagrant support for Spark'
- >>> standardize_jira_ref(
- ... "SPARK-1032. If Yarn app fails before registering, app master
stays aroun...")
- '[SPARK-1032] If Yarn app fails before registering, app master stays
aroun...'
- >>> standardize_jira_ref(
- ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved
words in DDL parser.")
- '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in
DDL parser.'
- >>> standardize_jira_ref(
- ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with
F.lit"')
- 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
- >>> standardize_jira_ref("Additional information for users building from
source code")
- 'Additional information for users building from source code'
+
+class Component:
+ """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+ ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+ "Documentation"); empty for status markers like [MINOR] that are not
+ JIRA components but are still recognized in PR titles.
+
+ ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+ e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+ the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+ ``primary`` marks components whose presence alone satisfies the merge-time
+ requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
Review Comment:
Stale example: `[PS]` was promoted to `primary=True` in the latest commit
(`Make PS primary; ...`). Replace with a still-non-primary tag so the docstring
stays accurate.
```suggestion
requirement. Non-primary JIRA components (e.g. [TEST], [SHUFFLE], [DEPLOY])
```
##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
resolve_jira_issue(merge_branches, comment, jira_id)
-def standardize_jira_ref(text):
- """
- Standardize the [SPARK-XXXXX] [MODULE] prefix
- Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK
XXX [MLLIB]: Issue" to
- "[SPARK-XXX][MLLIB] Issue"
-
- >>> standardize_jira_ref(
- ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete
is successful")
- '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is
successful'
- >>> standardize_jira_ref(
- ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in
pull requests")
- '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull
requests'
- >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
- '[SPARK-5954][MLLIB] Top by key'
- >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing
in TaskSchedulerImpl")
- '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
- >>> standardize_jira_ref(
- ... "SPARK-1094 Support MiMa for reporting binary compatibility across
versions.")
- '[SPARK-1094] Support MiMa for reporting binary compatibility across
versions.'
- >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
- '[SPARK-1146][WIP] Vagrant support for Spark'
- >>> standardize_jira_ref(
- ... "SPARK-1032. If Yarn app fails before registering, app master
stays aroun...")
- '[SPARK-1032] If Yarn app fails before registering, app master stays
aroun...'
- >>> standardize_jira_ref(
- ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved
words in DDL parser.")
- '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in
DDL parser.'
- >>> standardize_jira_ref(
- ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with
F.lit"')
- 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
- >>> standardize_jira_ref("Additional information for users building from
source code")
- 'Additional information for users building from source code'
+
+class Component:
+ """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+ ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+ "Documentation"); empty for status markers like [MINOR] that are not
+ JIRA components but are still recognized in PR titles.
+
+ ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+ e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+ the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+ ``primary`` marks components whose presence alone satisfies the merge-time
+ requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+ remain recognized — they normalize and pass through validation — but
+ they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+ markers are never primary. [WIP] is intentionally absent from the
+ registry: a WIP PR should be aborted at the earlier WIP warning, not
+ merged.
"""
- jira_refs = []
- components = []
- # If this is a Revert PR, no need to process any further
- if text.startswith('Revert "') and text.endswith('"'):
- return text
+ def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+ self.tag = tag
+ self.aliases = frozenset(aliases)
+ self.primary = primary
+ self.jira_name = jira_name
- # If the string is compliant, no need to process any further
- if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
- return text
+ def matches(self, token):
+ return token == self.tag or token in self.aliases
- # Extract JIRA ref(s):
- pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
- for ref in pattern.findall(text):
- # Add brackets, replace spaces with a dash, & convert to uppercase
- jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
- text = text.replace(ref, "")
+ @classmethod
+ def find(cls, token):
+ """Return the Component matching ``token`` (case-insensitive), or
None."""
+ if token is None:
+ return None
+ token = token.strip().upper()
+ for c in COMPONENTS:
+ if c.matches(token):
+ return c
+ return None
- # Extract spark component(s):
- # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
- pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
- for component in pattern.findall(text):
- components.append(component.upper())
- text = text.replace(component, "")
- # Cleanup any remaining symbols:
- pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
- if pattern.search(text) is not None:
- text = pattern.search(text).groups()[0]
- # Assemble full text (JIRA ref(s), module(s), remaining text)
- clean_text = "".join(jira_refs).strip() + "".join(components).strip() + "
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+# curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+ Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+ Component("BUILD", primary=True, jira_name="Build"),
+ Component("CONNECT", primary=True, jira_name="Connect"),
+ Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True,
jira_name="Spark Core"),
+ Component("DEPLOY", jira_name="Deploy"),
+ Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True,
jira_name="Documentation"),
+ Component("DOCKER", primary=True, jira_name="Spark Docker"),
+ Component("EC2", jira_name="EC2"),
+ Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+ Component("GRAPHX", primary=True, jira_name="GraphX"),
+ Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True,
jira_name="Project Infra"),
+ Component("IO", jira_name="Input/Output"),
+ Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java
API"),
+ Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+ Component("MESOS", jira_name="Mesos"),
+ Component("ML", primary=True, jira_name="ML"),
+ Component("MLLIB", primary=True, jira_name="MLlib"),
+ Component("OPTIMIZER", jira_name="Optimizer"),
+ Component("PROTOBUF", jira_name="Protobuf"),
+ Component("PS", primary=True, jira_name="Pandas API on Spark"),
+ Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+ Component("R", ("SPARKR",), primary=True, jira_name="R"),
+ Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"),
jira_name="Spark Shell"),
+ Component("SCHEDULER", jira_name="Scheduler"),
+ Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative
Pipelines"),
+ Component("SECURITY", primary=True, jira_name="Security"),
+ Component("SHUFFLE", jira_name="Shuffle"),
+ # Component("SPARKR", jira_name="SparkR"), # SPARKR is treated as an
alias of R above
Review Comment:
Dead commented-out registry entry — the alias is now declared on the `R` row
above (`Component("R", ("SPARKR",), ...)`), so this comment line can just be
deleted.
```suggestion
```
##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
resolve_jira_issue(merge_branches, comment, jira_id)
-def standardize_jira_ref(text):
- """
- Standardize the [SPARK-XXXXX] [MODULE] prefix
- Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK
XXX [MLLIB]: Issue" to
- "[SPARK-XXX][MLLIB] Issue"
-
- >>> standardize_jira_ref(
- ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete
is successful")
- '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is
successful'
- >>> standardize_jira_ref(
- ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in
pull requests")
- '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull
requests'
- >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
- '[SPARK-5954][MLLIB] Top by key'
- >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing
in TaskSchedulerImpl")
- '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
- >>> standardize_jira_ref(
- ... "SPARK-1094 Support MiMa for reporting binary compatibility across
versions.")
- '[SPARK-1094] Support MiMa for reporting binary compatibility across
versions.'
- >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
- '[SPARK-1146][WIP] Vagrant support for Spark'
- >>> standardize_jira_ref(
- ... "SPARK-1032. If Yarn app fails before registering, app master
stays aroun...")
- '[SPARK-1032] If Yarn app fails before registering, app master stays
aroun...'
- >>> standardize_jira_ref(
- ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved
words in DDL parser.")
- '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in
DDL parser.'
- >>> standardize_jira_ref(
- ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with
F.lit"')
- 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
- >>> standardize_jira_ref("Additional information for users building from
source code")
- 'Additional information for users building from source code'
+
+class Component:
+ """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+ ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+ "Documentation"); empty for status markers like [MINOR] that are not
+ JIRA components but are still recognized in PR titles.
+
+ ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+ e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+ the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+ ``primary`` marks components whose presence alone satisfies the merge-time
+ requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+ remain recognized — they normalize and pass through validation — but
+ they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+ markers are never primary. [WIP] is intentionally absent from the
+ registry: a WIP PR should be aborted at the earlier WIP warning, not
+ merged.
"""
- jira_refs = []
- components = []
- # If this is a Revert PR, no need to process any further
- if text.startswith('Revert "') and text.endswith('"'):
- return text
+ def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+ self.tag = tag
+ self.aliases = frozenset(aliases)
+ self.primary = primary
+ self.jira_name = jira_name
- # If the string is compliant, no need to process any further
- if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
- return text
+ def matches(self, token):
+ return token == self.tag or token in self.aliases
- # Extract JIRA ref(s):
- pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
- for ref in pattern.findall(text):
- # Add brackets, replace spaces with a dash, & convert to uppercase
- jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
- text = text.replace(ref, "")
+ @classmethod
+ def find(cls, token):
+ """Return the Component matching ``token`` (case-insensitive), or
None."""
+ if token is None:
+ return None
+ token = token.strip().upper()
+ for c in COMPONENTS:
+ if c.matches(token):
+ return c
+ return None
- # Extract spark component(s):
- # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
- pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
- for component in pattern.findall(text):
- components.append(component.upper())
- text = text.replace(component, "")
- # Cleanup any remaining symbols:
- pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
- if pattern.search(text) is not None:
- text = pattern.search(text).groups()[0]
- # Assemble full text (JIRA ref(s), module(s), remaining text)
- clean_text = "".join(jira_refs).strip() + "".join(components).strip() + "
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+# curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+ Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+ Component("BUILD", primary=True, jira_name="Build"),
+ Component("CONNECT", primary=True, jira_name="Connect"),
+ Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True,
jira_name="Spark Core"),
+ Component("DEPLOY", jira_name="Deploy"),
+ Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True,
jira_name="Documentation"),
+ Component("DOCKER", primary=True, jira_name="Spark Docker"),
+ Component("EC2", jira_name="EC2"),
+ Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+ Component("GRAPHX", primary=True, jira_name="GraphX"),
+ Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True,
jira_name="Project Infra"),
+ Component("IO", jira_name="Input/Output"),
+ Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java
API"),
+ Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+ Component("MESOS", jira_name="Mesos"),
+ Component("ML", primary=True, jira_name="ML"),
+ Component("MLLIB", primary=True, jira_name="MLlib"),
+ Component("OPTIMIZER", jira_name="Optimizer"),
+ Component("PROTOBUF", jira_name="Protobuf"),
+ Component("PS", primary=True, jira_name="Pandas API on Spark"),
+ Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+ Component("R", ("SPARKR",), primary=True, jira_name="R"),
+ Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"),
jira_name="Spark Shell"),
+ Component("SCHEDULER", jira_name="Scheduler"),
+ Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative
Pipelines"),
+ Component("SECURITY", primary=True, jira_name="Security"),
+ Component("SHUFFLE", jira_name="Shuffle"),
+ # Component("SPARKR", jira_name="SparkR"), # SPARKR is treated as an
alias of R above
+ Component("SQL", primary=True, jira_name="SQL"),
+ Component("SS", primary=True, jira_name="Structured Streaming"),
+ Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True,
jira_name="DStreams"),
+ Component("SUBMIT", jira_name="Spark Submit"),
+ Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+ Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True,
jira_name="Web UI"),
+ Component("WINDOWS", primary=True, jira_name="Windows"),
+ Component("YARN", primary=True, jira_name="YARN"),
+ # Status markers — recognized in PR titles, but not JIRA components.
+ Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+ Component("MINOR"),
+ Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")
Review Comment:
The character class in `_BRACKET_TAG_RE` (`[A-Za-z0-9._-]+`) does not allow
internal spaces, so every canonical tag or alias with an embedded space is
unreachable from `Title.parse` and can only be resolved through the interactive
`prompt_for_components` path. That affects:
- canonical: `"BLOCK MANAGER"` (only the underscore alias is
parser-reachable)
- aliases: `"SPARK CORE"`, `"PROJECT INFRA"`, `"WEB UI"`, `"SPARK SHELL"`,
`"JAVA API"`, `"FOLLOW UP"`
Practical impact: in your reply to @LuciferYang you said `[PROJECT INFRA]`
in a title would automatically map to `[INFRA]`, but the parser actually leaves
it in the body text. I verified by importing the module:
```
>>> Title.parse("[SPARK-1234][PROJECT INFRA] Fix")
leading=['SPARK-1234'], components=[], text='[PROJECT INFRA] Fix'
>>> str(_)
'[SPARK-1234] [PROJECT INFRA] Fix'
```
The prompt fires (no primary), and the final title becomes
`[SPARK-1234][INFRA] [PROJECT INFRA] Fix`.
Your dry-run on 1000 commits + 200 PRs shows this form doesn't occur in
practice, so it's a theoretical regression only. Two options if you want to
close it cleanly: drop the with-space variants from the registry (since they're
parser-unreachable they only serve the interactive path), or extend the regex
to accept internal whitespace so the canonical form is consistent everywhere.
Up to you — happy to leave as-is if you prefer.
##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
resolve_jira_issue(merge_branches, comment, jira_id)
-def standardize_jira_ref(text):
- """
- Standardize the [SPARK-XXXXX] [MODULE] prefix
- Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK
XXX [MLLIB]: Issue" to
- "[SPARK-XXX][MLLIB] Issue"
-
- >>> standardize_jira_ref(
- ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete
is successful")
- '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is
successful'
- >>> standardize_jira_ref(
- ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in
pull requests")
- '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull
requests'
- >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
- '[SPARK-5954][MLLIB] Top by key'
- >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing
in TaskSchedulerImpl")
- '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
- >>> standardize_jira_ref(
- ... "SPARK-1094 Support MiMa for reporting binary compatibility across
versions.")
- '[SPARK-1094] Support MiMa for reporting binary compatibility across
versions.'
- >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
- '[SPARK-1146][WIP] Vagrant support for Spark'
- >>> standardize_jira_ref(
- ... "SPARK-1032. If Yarn app fails before registering, app master
stays aroun...")
- '[SPARK-1032] If Yarn app fails before registering, app master stays
aroun...'
- >>> standardize_jira_ref(
- ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved
words in DDL parser.")
- '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in
DDL parser.'
- >>> standardize_jira_ref(
- ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with
F.lit"')
- 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
- >>> standardize_jira_ref("Additional information for users building from
source code")
- 'Additional information for users building from source code'
+
+class Component:
+ """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+ ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+ "Documentation"); empty for status markers like [MINOR] that are not
+ JIRA components but are still recognized in PR titles.
+
+ ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+ e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+ the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+ ``primary`` marks components whose presence alone satisfies the merge-time
+ requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+ remain recognized — they normalize and pass through validation — but
+ they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+ markers are never primary. [WIP] is intentionally absent from the
+ registry: a WIP PR should be aborted at the earlier WIP warning, not
+ merged.
"""
- jira_refs = []
- components = []
- # If this is a Revert PR, no need to process any further
- if text.startswith('Revert "') and text.endswith('"'):
- return text
+ def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+ self.tag = tag
+ self.aliases = frozenset(aliases)
+ self.primary = primary
+ self.jira_name = jira_name
- # If the string is compliant, no need to process any further
- if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
- return text
+ def matches(self, token):
+ return token == self.tag or token in self.aliases
- # Extract JIRA ref(s):
- pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
- for ref in pattern.findall(text):
- # Add brackets, replace spaces with a dash, & convert to uppercase
- jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
- text = text.replace(ref, "")
+ @classmethod
+ def find(cls, token):
+ """Return the Component matching ``token`` (case-insensitive), or
None."""
+ if token is None:
+ return None
+ token = token.strip().upper()
+ for c in COMPONENTS:
+ if c.matches(token):
+ return c
+ return None
- # Extract spark component(s):
- # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
- pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
- for component in pattern.findall(text):
- components.append(component.upper())
- text = text.replace(component, "")
- # Cleanup any remaining symbols:
- pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
- if pattern.search(text) is not None:
- text = pattern.search(text).groups()[0]
- # Assemble full text (JIRA ref(s), module(s), remaining text)
- clean_text = "".join(jira_refs).strip() + "".join(components).strip() + "
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+# curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+ Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+ Component("BUILD", primary=True, jira_name="Build"),
+ Component("CONNECT", primary=True, jira_name="Connect"),
+ Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True,
jira_name="Spark Core"),
+ Component("DEPLOY", jira_name="Deploy"),
+ Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True,
jira_name="Documentation"),
+ Component("DOCKER", primary=True, jira_name="Spark Docker"),
+ Component("EC2", jira_name="EC2"),
+ Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+ Component("GRAPHX", primary=True, jira_name="GraphX"),
+ Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True,
jira_name="Project Infra"),
+ Component("IO", jira_name="Input/Output"),
+ Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java
API"),
+ Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+ Component("MESOS", jira_name="Mesos"),
+ Component("ML", primary=True, jira_name="ML"),
+ Component("MLLIB", primary=True, jira_name="MLlib"),
+ Component("OPTIMIZER", jira_name="Optimizer"),
+ Component("PROTOBUF", jira_name="Protobuf"),
+ Component("PS", primary=True, jira_name="Pandas API on Spark"),
+ Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+ Component("R", ("SPARKR",), primary=True, jira_name="R"),
+ Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"),
jira_name="Spark Shell"),
+ Component("SCHEDULER", jira_name="Scheduler"),
+ Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative
Pipelines"),
+ Component("SECURITY", primary=True, jira_name="Security"),
+ Component("SHUFFLE", jira_name="Shuffle"),
+ # Component("SPARKR", jira_name="SparkR"), # SPARKR is treated as an
alias of R above
+ Component("SQL", primary=True, jira_name="SQL"),
+ Component("SS", primary=True, jira_name="Structured Streaming"),
+ Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True,
jira_name="DStreams"),
+ Component("SUBMIT", jira_name="Spark Submit"),
+ Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+ Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True,
jira_name="Web UI"),
+ Component("WINDOWS", primary=True, jira_name="Windows"),
+ Component("YARN", primary=True, jira_name="YARN"),
+ # Status markers — recognized in PR titles, but not JIRA components.
+ Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+ Component("MINOR"),
+ Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")
+_SPARK_ID_RE = re.compile(r"^SPARK-\d+$", re.IGNORECASE)
+_VERSION_TAG_RE = re.compile(r"^\d+\.(\d+|X)$")
+_LEADING_TAGS = frozenset({"MINOR", "TRIVIAL"})
+
+
+class Title:
Review Comment:
Same coverage gap I flagged last round: there is no doctest for the
empty-body case (`Title.parse("[SPARK-1234]")`). It parses fine
(`leading=['SPARK-1234'], components=[], text=''`) and round-trips through
`__str__` correctly, but a doctest would pin that behavior — and exercise the
`if not self.text: return parts` branch in `__str__`, which is otherwise
uncovered.
##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
resolve_jira_issue(merge_branches, comment, jira_id)
-def standardize_jira_ref(text):
- """
- Standardize the [SPARK-XXXXX] [MODULE] prefix
- Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK
XXX [MLLIB]: Issue" to
- "[SPARK-XXX][MLLIB] Issue"
-
- >>> standardize_jira_ref(
- ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete
is successful")
- '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is
successful'
- >>> standardize_jira_ref(
- ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in
pull requests")
- '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull
requests'
- >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
- '[SPARK-5954][MLLIB] Top by key'
- >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing
in TaskSchedulerImpl")
- '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
- >>> standardize_jira_ref(
- ... "SPARK-1094 Support MiMa for reporting binary compatibility across
versions.")
- '[SPARK-1094] Support MiMa for reporting binary compatibility across
versions.'
- >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
- '[SPARK-1146][WIP] Vagrant support for Spark'
- >>> standardize_jira_ref(
- ... "SPARK-1032. If Yarn app fails before registering, app master
stays aroun...")
- '[SPARK-1032] If Yarn app fails before registering, app master stays
aroun...'
- >>> standardize_jira_ref(
- ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved
words in DDL parser.")
- '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in
DDL parser.'
- >>> standardize_jira_ref(
- ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with
F.lit"')
- 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
- >>> standardize_jira_ref("Additional information for users building from
source code")
- 'Additional information for users building from source code'
+
+class Component:
+ """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+ ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+ "Documentation"); empty for status markers like [MINOR] that are not
+ JIRA components but are still recognized in PR titles.
+
+ ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+ e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+ the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+ ``primary`` marks components whose presence alone satisfies the merge-time
+ requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+ remain recognized — they normalize and pass through validation — but
+ they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+ markers are never primary. [WIP] is intentionally absent from the
+ registry: a WIP PR should be aborted at the earlier WIP warning, not
+ merged.
"""
- jira_refs = []
- components = []
- # If this is a Revert PR, no need to process any further
- if text.startswith('Revert "') and text.endswith('"'):
- return text
+ def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+ self.tag = tag
+ self.aliases = frozenset(aliases)
+ self.primary = primary
+ self.jira_name = jira_name
- # If the string is compliant, no need to process any further
- if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
- return text
+ def matches(self, token):
+ return token == self.tag or token in self.aliases
- # Extract JIRA ref(s):
- pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
- for ref in pattern.findall(text):
- # Add brackets, replace spaces with a dash, & convert to uppercase
- jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
- text = text.replace(ref, "")
+ @classmethod
+ def find(cls, token):
+ """Return the Component matching ``token`` (case-insensitive), or
None."""
+ if token is None:
+ return None
+ token = token.strip().upper()
+ for c in COMPONENTS:
+ if c.matches(token):
+ return c
+ return None
- # Extract spark component(s):
- # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
- pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
- for component in pattern.findall(text):
- components.append(component.upper())
- text = text.replace(component, "")
- # Cleanup any remaining symbols:
- pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
- if pattern.search(text) is not None:
- text = pattern.search(text).groups()[0]
- # Assemble full text (JIRA ref(s), module(s), remaining text)
- clean_text = "".join(jira_refs).strip() + "".join(components).strip() + "
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+# curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+ Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+ Component("BUILD", primary=True, jira_name="Build"),
+ Component("CONNECT", primary=True, jira_name="Connect"),
+ Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True,
jira_name="Spark Core"),
+ Component("DEPLOY", jira_name="Deploy"),
+ Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True,
jira_name="Documentation"),
+ Component("DOCKER", primary=True, jira_name="Spark Docker"),
+ Component("EC2", jira_name="EC2"),
+ Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+ Component("GRAPHX", primary=True, jira_name="GraphX"),
+ Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True,
jira_name="Project Infra"),
+ Component("IO", jira_name="Input/Output"),
+ Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java
API"),
+ Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+ Component("MESOS", jira_name="Mesos"),
+ Component("ML", primary=True, jira_name="ML"),
+ Component("MLLIB", primary=True, jira_name="MLlib"),
+ Component("OPTIMIZER", jira_name="Optimizer"),
+ Component("PROTOBUF", jira_name="Protobuf"),
+ Component("PS", primary=True, jira_name="Pandas API on Spark"),
+ Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+ Component("R", ("SPARKR",), primary=True, jira_name="R"),
+ Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"),
jira_name="Spark Shell"),
+ Component("SCHEDULER", jira_name="Scheduler"),
+ Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative
Pipelines"),
+ Component("SECURITY", primary=True, jira_name="Security"),
+ Component("SHUFFLE", jira_name="Shuffle"),
+ # Component("SPARKR", jira_name="SparkR"), # SPARKR is treated as an
alias of R above
+ Component("SQL", primary=True, jira_name="SQL"),
+ Component("SS", primary=True, jira_name="Structured Streaming"),
+ Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True,
jira_name="DStreams"),
+ Component("SUBMIT", jira_name="Spark Submit"),
+ Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+ Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True,
jira_name="Web UI"),
+ Component("WINDOWS", primary=True, jira_name="Windows"),
+ Component("YARN", primary=True, jira_name="YARN"),
+ # Status markers — recognized in PR titles, but not JIRA components.
+ Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+ Component("MINOR"),
+ Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")
+_SPARK_ID_RE = re.compile(r"^SPARK-\d+$", re.IGNORECASE)
+_VERSION_TAG_RE = re.compile(r"^\d+\.(\d+|X)$")
+_LEADING_TAGS = frozenset({"MINOR", "TRIVIAL"})
+
+
+class Title:
+ """Structured PR title: SPARK refs, component tags, and body.
+
+ ``leading`` — SPARK-NNNNN IDs and [MINOR]/[TRIVIAL] markers, in order.
+ ``components`` — all other bracket tags, in order.
+ ``text`` — body text following the bracket sequence.
+
+ >>> t = Title.parse("[SPARK-1234][SQL] Fix something")
+ >>> t.leading, t.components, t.text
+ (['SPARK-1234'], ['SQL'], 'Fix something')
+ >>> str(t)
+ '[SPARK-1234][SQL] Fix something'
+ >>> t = Title.parse("[SPARK-1234][SQL][FOLLOWUP] Fix something")
+ >>> t.leading, t.components, t.text
+ (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix something')
+ >>> str(t)
+ '[SPARK-1234][SQL][FOLLOWUP] Fix something'
+ """
- # Replace multiple spaces with a single space, e.g. if no jira refs and/or
components were
- # included
- clean_text = re.sub(r"\s+", " ", clean_text.strip())
+ def __init__(
+ self,
+ leading: List[str],
+ components: List[str],
+ text: str,
+ ) -> None:
+ self.leading = leading
+ self.components = components
+ self.text = text
+
+ @classmethod
+ def parse(cls, raw: str) -> "Title":
+ """Parse a PR title string into a :class:`Title`.
+
+ A title must open with a leading tag ([SPARK-NNNNN], [MINOR], or
+ [TRIVIAL]); otherwise :exc:`ValueError` is raised. Subsequent bracket
+ tokens (spaces trimmed, separated by optional whitespace) go to
+ ``components``. The remainder is ``text``.
+
+ >>> t = Title.parse("[SPARK-1234][SQL][TESTS] Fix something")
+ >>> t.leading, t.components, t.text
+ (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something')
+ >>> t = Title.parse(" [ SPARK-1234 ] [ SQL ] [ TESTS ] Fix
something")
+ >>> t.leading, t.components, t.text
+ (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something')
+ >>> t = Title.parse("[SPARK-1234 ][ sql ][ followup ] Fix")
+ >>> t.leading, t.components, t.text
+ (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix')
+ >>> str(t)
+ '[SPARK-1234][SQL][FOLLOWUP] Fix'
+ >>> Title.parse("[MINOR] Fix typo").leading
+ ['MINOR']
+ >>> t = Title.parse("[spark-1234][sql][followup] Fix")
+ >>> t.leading, t.components
+ (['SPARK-1234'], ['SQL', 'FOLLOWUP'])
+ >>> Title.parse("[SPARK-1234][SPARK-5678][SQL] Fix").leading
+ ['SPARK-1234', 'SPARK-5678']
+ >>> Title.parse("[SPARK-1234][4.X][SQL] Fix").components
+ ['4.X', 'SQL']
+ >>> Title.parse("[SPARK-1234][SQL][4.2] Fix").components
+ ['SQL', '4.2']
+ >>> Title.parse("[SQL] Fix")
+ Traceback (most recent call last):
+ ...
+ ValueError: title must start with [SPARK-NNNNN], [MINOR], or
[TRIVIAL]: '[SQL] Fix'
+ >>> Title.parse("No brackets")
+ Traceback (most recent call last):
+ ...
+ ValueError: title must start with [SPARK-NNNNN], [MINOR], or
[TRIVIAL]: 'No brackets'
+ >>> Title.parse("[SPARK-1234][SQL][SPARK-123] Fix")
+ Traceback (most recent call last):
+ ...
+ ValueError: [SPARK-NNNNN] tags must all appear before other tags:
'[SPARK-1234][SQL][SPARK-123] Fix'
+ >>> Title.parse("[SPARK-1234][MINOR][SQL] Fix")
+ Traceback (most recent call last):
+ ...
+ ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist
+ >>> Title.parse("[MINOR][TRIVIAL][SQL] Fix")
+ Traceback (most recent call last):
+ ...
+ ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist
+ """
+ leading: List[str] = []
+ components: List[str] = []
+
+ raw = raw.strip()
+ m0 = _BRACKET_TAG_RE.match(raw)
+ first = m0.group(1).upper() if m0 else ""
+ if not (_SPARK_ID_RE.match(first) or first in _LEADING_TAGS):
+ raise ValueError(
+ "title must start with [SPARK-NNNNN], [MINOR], or [TRIVIAL]:
%r" % raw
+ )
- return clean_text
+ past_leading = False
+ pos = 0
+ while pos < len(raw):
+ m = _BRACKET_TAG_RE.match(raw, pos)
+ if not m:
+ break
+ tag = m.group(1).upper()
+ if _SPARK_ID_RE.match(tag):
+ if past_leading:
+ raise ValueError(
+ "[SPARK-NNNNN] tags must all appear before other tags:
%r" % raw
+ )
+ leading.append(tag)
+ elif tag in _LEADING_TAGS:
+ leading.append(tag)
+ else:
+ components.append(tag)
+ past_leading = True
+ pos = m.end()
+ while pos < len(raw) and raw[pos] == " ":
+ pos += 1
+
+ text = raw[pos:].lstrip()
+ markers = [t for t in leading if t in _LEADING_TAGS]
+ if len(markers) > 1 or (markers and len(leading) > len(markers)):
+ raise ValueError(
+ "[SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist"
+ )
+ return cls(leading, components, text)
+
+ def __str__(self) -> str:
+ parts = "".join("[%s]" % t for t in self.leading)
+ parts += "".join("[%s]" % c for c in self.components)
+ if not self.text:
+ return parts
+ return parts + (" " if parts else "") + self.text
+
+
+def prompt_for_components():
+ """
+ Prompt the committer for component(s) when the PR title lacks a primary
+ component. Each entered token is normalized via Component.find
+ (e.g. "DOCS" -> "DOC", "PYSPARK" -> "PYTHON"). Unrecognized tokens are
+ passed through as-is. Re-prompts until at least one entered token resolves
+ to a tag in PRIMARY_COMPONENTS. Returns an uppercase list of tags in
Review Comment:
Stale name — there is no `PRIMARY_COMPONENTS` constant in the module;
whether a component is primary is recorded on each `Component` via its
`primary` attribute.
Suggested rewording:
```suggestion
to a primary Component (one with primary=True). Returns an uppercase
list of tags in
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]