Re: [PR] [SPARK-56979][INFRA] Require COMPONENT tag in PR title at merge time [spark]

via GitHub Sun, 24 May 2026 20:32:35 -0700


zhengruifeng commented on code in PR #56026:
URL: https://github.com/apache/spark/pull/56026#discussion_r3295918145



##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
         resolve_jira_issue(merge_branches, comment, jira_id)
 
 
-def standardize_jira_ref(text):
-    """
-    Standardize the [SPARK-XXXXX] [MODULE] prefix
-    Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK 
XXX [MLLIB]: Issue" to
-    "[SPARK-XXX][MLLIB] Issue"
-
-    >>> standardize_jira_ref(
-    ...     "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete 
is successful")
-    '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is 
successful'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in 
pull requests")
-    '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull 
requests'
-    >>> standardize_jira_ref("[MLlib] Spark  5954: Top by key")
-    '[SPARK-5954][MLLIB] Top by key'
-    >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing 
in TaskSchedulerImpl")
-    '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1094 Support MiMa for reporting binary compatibility across 
versions.")
-    '[SPARK-1094] Support MiMa for reporting binary compatibility across 
versions.'
-    >>> standardize_jira_ref("[WIP]  [SPARK-1146] Vagrant support for Spark")
-    '[SPARK-1146][WIP] Vagrant support for Spark'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1032. If Yarn app fails before registering, app master 
stays aroun...")
-    '[SPARK-1032] If Yarn app fails before registering, app master stays 
aroun...'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved 
words in DDL parser.")
-    '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in 
DDL parser.'
-    >>> standardize_jira_ref(
-    ...     'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with 
F.lit"')
-    'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
-    >>> standardize_jira_ref("Additional information for users building from 
source code")
-    'Additional information for users building from source code'
+
+class Component:
+    """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+    ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+    "Documentation"); empty for status markers like [MINOR] that are not
+    JIRA components but are still recognized in PR titles.
+
+    ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+    e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+    the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+    ``primary`` marks components whose presence alone satisfies the merge-time
+    requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])

Review Comment:
   Done — updated to use [tag name missing from archived message] (still non-primary). Thanks!



##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
         resolve_jira_issue(merge_branches, comment, jira_id)
 
 
-def standardize_jira_ref(text):
-    """
-    Standardize the [SPARK-XXXXX] [MODULE] prefix
-    Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK 
XXX [MLLIB]: Issue" to
-    "[SPARK-XXX][MLLIB] Issue"
-
-    >>> standardize_jira_ref(
-    ...     "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete 
is successful")
-    '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is 
successful'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in 
pull requests")
-    '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull 
requests'
-    >>> standardize_jira_ref("[MLlib] Spark  5954: Top by key")
-    '[SPARK-5954][MLLIB] Top by key'
-    >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing 
in TaskSchedulerImpl")
-    '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1094 Support MiMa for reporting binary compatibility across 
versions.")
-    '[SPARK-1094] Support MiMa for reporting binary compatibility across 
versions.'
-    >>> standardize_jira_ref("[WIP]  [SPARK-1146] Vagrant support for Spark")
-    '[SPARK-1146][WIP] Vagrant support for Spark'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1032. If Yarn app fails before registering, app master 
stays aroun...")
-    '[SPARK-1032] If Yarn app fails before registering, app master stays 
aroun...'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved 
words in DDL parser.")
-    '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in 
DDL parser.'
-    >>> standardize_jira_ref(
-    ...     'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with 
F.lit"')
-    'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
-    >>> standardize_jira_ref("Additional information for users building from 
source code")
-    'Additional information for users building from source code'
+
+class Component:
+    """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+    ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+    "Documentation"); empty for status markers like [MINOR] that are not
+    JIRA components but are still recognized in PR titles.
+
+    ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+    e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+    the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+    ``primary`` marks components whose presence alone satisfies the merge-time
+    requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+    remain recognized — they normalize and pass through validation — but
+    they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+    markers are never primary. [WIP] is intentionally absent from the
+    registry: a WIP PR should be aborted at the earlier WIP warning, not
+    merged.
     """
-    jira_refs = []
-    components = []
 
-    # If this is a Revert PR, no need to process any further
-    if text.startswith('Revert "') and text.endswith('"'):
-        return text
+    def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+        self.tag = tag
+        self.aliases = frozenset(aliases)
+        self.primary = primary
+        self.jira_name = jira_name
 
-    # If the string is compliant, no need to process any further
-    if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
-        return text
+    def matches(self, token):
+        return token == self.tag or token in self.aliases
 
-    # Extract JIRA ref(s):
-    pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
-    for ref in pattern.findall(text):
-        # Add brackets, replace spaces with a dash, & convert to uppercase
-        jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
-        text = text.replace(ref, "")
+    @classmethod
+    def find(cls, token):
+        """Return the Component matching ``token`` (case-insensitive), or 
None."""
+        if token is None:
+            return None
+        token = token.strip().upper()
+        for c in COMPONENTS:
+            if c.matches(token):
+                return c
+        return None
 
-    # Extract spark component(s):
-    # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
-    for component in pattern.findall(text):
-        components.append(component.upper())
-        text = text.replace(component, "")
 
-    # Cleanup any remaining symbols:
-    pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
-    if pattern.search(text) is not None:
-        text = pattern.search(text).groups()[0]
 
-    # Assemble full text (JIRA ref(s), module(s), remaining text)
-    clean_text = "".join(jira_refs).strip() + "".join(components).strip() + " 
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+#   curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+    Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+    Component("BUILD", primary=True, jira_name="Build"),
+    Component("CONNECT", primary=True, jira_name="Connect"),
+    Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True, 
jira_name="Spark Core"),
+    Component("DEPLOY", jira_name="Deploy"),
+    Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True, 
jira_name="Documentation"),
+    Component("DOCKER", primary=True, jira_name="Spark Docker"),
+    Component("EC2", jira_name="EC2"),
+    Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+    Component("GRAPHX", primary=True, jira_name="GraphX"),
+    Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True, 
jira_name="Project Infra"),
+    Component("IO", jira_name="Input/Output"),
+    Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java 
API"),
+    Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+    Component("MESOS", jira_name="Mesos"),
+    Component("ML", primary=True, jira_name="ML"),
+    Component("MLLIB", primary=True, jira_name="MLlib"),
+    Component("OPTIMIZER", jira_name="Optimizer"),
+    Component("PROTOBUF", jira_name="Protobuf"),
+    Component("PS", primary=True, jira_name="Pandas API on Spark"),
+    Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+    Component("R", ("SPARKR",), primary=True, jira_name="R"),
+    Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"), 
jira_name="Spark Shell"),
+    Component("SCHEDULER", jira_name="Scheduler"),
+    Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative 
Pipelines"),
+    Component("SECURITY", primary=True, jira_name="Security"),
+    Component("SHUFFLE", jira_name="Shuffle"),
+    # Component("SPARKR", jira_name="SparkR"),  # SPARKR is treated as an 
alias of R above

Review Comment:
   Done — removed the dead comment line.



##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
         resolve_jira_issue(merge_branches, comment, jira_id)
 
 
-def standardize_jira_ref(text):
-    """
-    Standardize the [SPARK-XXXXX] [MODULE] prefix
-    Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK 
XXX [MLLIB]: Issue" to
-    "[SPARK-XXX][MLLIB] Issue"
-
-    >>> standardize_jira_ref(
-    ...     "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete 
is successful")
-    '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is 
successful'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in 
pull requests")
-    '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull 
requests'
-    >>> standardize_jira_ref("[MLlib] Spark  5954: Top by key")
-    '[SPARK-5954][MLLIB] Top by key'
-    >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing 
in TaskSchedulerImpl")
-    '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1094 Support MiMa for reporting binary compatibility across 
versions.")
-    '[SPARK-1094] Support MiMa for reporting binary compatibility across 
versions.'
-    >>> standardize_jira_ref("[WIP]  [SPARK-1146] Vagrant support for Spark")
-    '[SPARK-1146][WIP] Vagrant support for Spark'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1032. If Yarn app fails before registering, app master 
stays aroun...")
-    '[SPARK-1032] If Yarn app fails before registering, app master stays 
aroun...'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved 
words in DDL parser.")
-    '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in 
DDL parser.'
-    >>> standardize_jira_ref(
-    ...     'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with 
F.lit"')
-    'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
-    >>> standardize_jira_ref("Additional information for users building from 
source code")
-    'Additional information for users building from source code'
+
+class Component:
+    """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+    ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+    "Documentation"); empty for status markers like [MINOR] that are not
+    JIRA components but are still recognized in PR titles.
+
+    ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+    e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+    the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+    ``primary`` marks components whose presence alone satisfies the merge-time
+    requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+    remain recognized — they normalize and pass through validation — but
+    they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+    markers are never primary. [WIP] is intentionally absent from the
+    registry: a WIP PR should be aborted at the earlier WIP warning, not
+    merged.
     """
-    jira_refs = []
-    components = []
 
-    # If this is a Revert PR, no need to process any further
-    if text.startswith('Revert "') and text.endswith('"'):
-        return text
+    def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+        self.tag = tag
+        self.aliases = frozenset(aliases)
+        self.primary = primary
+        self.jira_name = jira_name
 
-    # If the string is compliant, no need to process any further
-    if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
-        return text
+    def matches(self, token):
+        return token == self.tag or token in self.aliases
 
-    # Extract JIRA ref(s):
-    pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
-    for ref in pattern.findall(text):
-        # Add brackets, replace spaces with a dash, & convert to uppercase
-        jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
-        text = text.replace(ref, "")
+    @classmethod
+    def find(cls, token):
+        """Return the Component matching ``token`` (case-insensitive), or 
None."""
+        if token is None:
+            return None
+        token = token.strip().upper()
+        for c in COMPONENTS:
+            if c.matches(token):
+                return c
+        return None
 
-    # Extract spark component(s):
-    # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
-    for component in pattern.findall(text):
-        components.append(component.upper())
-        text = text.replace(component, "")
 
-    # Cleanup any remaining symbols:
-    pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
-    if pattern.search(text) is not None:
-        text = pattern.search(text).groups()[0]
 
-    # Assemble full text (JIRA ref(s), module(s), remaining text)
-    clean_text = "".join(jira_refs).strip() + "".join(components).strip() + " 
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+#   curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+    Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+    Component("BUILD", primary=True, jira_name="Build"),
+    Component("CONNECT", primary=True, jira_name="Connect"),
+    Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True, 
jira_name="Spark Core"),
+    Component("DEPLOY", jira_name="Deploy"),
+    Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True, 
jira_name="Documentation"),
+    Component("DOCKER", primary=True, jira_name="Spark Docker"),
+    Component("EC2", jira_name="EC2"),
+    Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+    Component("GRAPHX", primary=True, jira_name="GraphX"),
+    Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True, 
jira_name="Project Infra"),
+    Component("IO", jira_name="Input/Output"),
+    Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java 
API"),
+    Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+    Component("MESOS", jira_name="Mesos"),
+    Component("ML", primary=True, jira_name="ML"),
+    Component("MLLIB", primary=True, jira_name="MLlib"),
+    Component("OPTIMIZER", jira_name="Optimizer"),
+    Component("PROTOBUF", jira_name="Protobuf"),
+    Component("PS", primary=True, jira_name="Pandas API on Spark"),
+    Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+    Component("R", ("SPARKR",), primary=True, jira_name="R"),
+    Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"), 
jira_name="Spark Shell"),
+    Component("SCHEDULER", jira_name="Scheduler"),
+    Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative 
Pipelines"),
+    Component("SECURITY", primary=True, jira_name="Security"),
+    Component("SHUFFLE", jira_name="Shuffle"),
+    # Component("SPARKR", jira_name="SparkR"),  # SPARKR is treated as an 
alias of R above
+    Component("SQL", primary=True, jira_name="SQL"),
+    Component("SS", primary=True, jira_name="Structured Streaming"),
+    Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True, 
jira_name="DStreams"),
+    Component("SUBMIT", jira_name="Spark Submit"),
+    Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+    Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True, 
jira_name="Web UI"),
+    Component("WINDOWS", primary=True, jira_name="Windows"),
+    Component("YARN", primary=True, jira_name="YARN"),
+    # Status markers — recognized in PR titles, but not JIRA components.
+    Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+    Component("MINOR"),
+    Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")

Review Comment:
   Good catch — went with the first option and dropped the parser-unreachable 
with-space variants (`SPARK CORE`, `PROJECT INFRA`, `JAVA API`, `SPARK SHELL`, 
`WEB UI`, `FOLLOW UP`). Also flipped `BLOCK MANAGER` → `BLOCK_MANAGER` so the 
canonical tag is parser-reachable. The remaining underscore aliases stay so the 
interactive prompt still accepts `SPARK_CORE`, `PROJECT_INFRA`, etc.



##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
         resolve_jira_issue(merge_branches, comment, jira_id)
 
 
-def standardize_jira_ref(text):
-    """
-    Standardize the [SPARK-XXXXX] [MODULE] prefix
-    Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK 
XXX [MLLIB]: Issue" to
-    "[SPARK-XXX][MLLIB] Issue"
-
-    >>> standardize_jira_ref(
-    ...     "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete 
is successful")
-    '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is 
successful'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in 
pull requests")
-    '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull 
requests'
-    >>> standardize_jira_ref("[MLlib] Spark  5954: Top by key")
-    '[SPARK-5954][MLLIB] Top by key'
-    >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing 
in TaskSchedulerImpl")
-    '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1094 Support MiMa for reporting binary compatibility across 
versions.")
-    '[SPARK-1094] Support MiMa for reporting binary compatibility across 
versions.'
-    >>> standardize_jira_ref("[WIP]  [SPARK-1146] Vagrant support for Spark")
-    '[SPARK-1146][WIP] Vagrant support for Spark'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1032. If Yarn app fails before registering, app master 
stays aroun...")
-    '[SPARK-1032] If Yarn app fails before registering, app master stays 
aroun...'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved 
words in DDL parser.")
-    '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in 
DDL parser.'
-    >>> standardize_jira_ref(
-    ...     'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with 
F.lit"')
-    'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
-    >>> standardize_jira_ref("Additional information for users building from 
source code")
-    'Additional information for users building from source code'
+
+class Component:
+    """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+    ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+    "Documentation"); empty for status markers like [MINOR] that are not
+    JIRA components but are still recognized in PR titles.
+
+    ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+    e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+    the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+    ``primary`` marks components whose presence alone satisfies the merge-time
+    requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE])
+    remain recognized — they normalize and pass through validation — but
+    they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+    markers are never primary. [WIP] is intentionally absent from the
+    registry: a WIP PR should be aborted at the earlier WIP warning, not
+    merged.
     """
-    jira_refs = []
-    components = []
 
-    # If this is a Revert PR, no need to process any further
-    if text.startswith('Revert "') and text.endswith('"'):
-        return text
+    def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+        self.tag = tag
+        self.aliases = frozenset(aliases)
+        self.primary = primary
+        self.jira_name = jira_name
 
-    # If the string is compliant, no need to process any further
-    if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
-        return text
+    def matches(self, token):
+        return token == self.tag or token in self.aliases
 
-    # Extract JIRA ref(s):
-    pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
-    for ref in pattern.findall(text):
-        # Add brackets, replace spaces with a dash, & convert to uppercase
-        jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
-        text = text.replace(ref, "")
+    @classmethod
+    def find(cls, token):
+        """Return the Component matching ``token`` (case-insensitive), or 
None."""
+        if token is None:
+            return None
+        token = token.strip().upper()
+        for c in COMPONENTS:
+            if c.matches(token):
+                return c
+        return None
 
-    # Extract spark component(s):
-    # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
-    for component in pattern.findall(text):
-        components.append(component.upper())
-        text = text.replace(component, "")
 
-    # Cleanup any remaining symbols:
-    pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
-    if pattern.search(text) is not None:
-        text = pattern.search(text).groups()[0]
 
-    # Assemble full text (JIRA ref(s), module(s), remaining text)
-    clean_text = "".join(jira_refs).strip() + "".join(components).strip() + " 
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+#   curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+    Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+    Component("BUILD", primary=True, jira_name="Build"),
+    Component("CONNECT", primary=True, jira_name="Connect"),
+    Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True, 
jira_name="Spark Core"),
+    Component("DEPLOY", jira_name="Deploy"),
+    Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True, 
jira_name="Documentation"),
+    Component("DOCKER", primary=True, jira_name="Spark Docker"),
+    Component("EC2", jira_name="EC2"),
+    Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+    Component("GRAPHX", primary=True, jira_name="GraphX"),
+    Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True, 
jira_name="Project Infra"),
+    Component("IO", jira_name="Input/Output"),
+    Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java 
API"),
+    Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+    Component("MESOS", jira_name="Mesos"),
+    Component("ML", primary=True, jira_name="ML"),
+    Component("MLLIB", primary=True, jira_name="MLlib"),
+    Component("OPTIMIZER", jira_name="Optimizer"),
+    Component("PROTOBUF", jira_name="Protobuf"),
+    Component("PS", primary=True, jira_name="Pandas API on Spark"),
+    Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+    Component("R", ("SPARKR",), primary=True, jira_name="R"),
+    Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"), 
jira_name="Spark Shell"),
+    Component("SCHEDULER", jira_name="Scheduler"),
+    Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative 
Pipelines"),
+    Component("SECURITY", primary=True, jira_name="Security"),
+    Component("SHUFFLE", jira_name="Shuffle"),
+    # Component("SPARKR", jira_name="SparkR"),  # SPARKR is treated as an 
alias of R above
+    Component("SQL", primary=True, jira_name="SQL"),
+    Component("SS", primary=True, jira_name="Structured Streaming"),
+    Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True, 
jira_name="DStreams"),
+    Component("SUBMIT", jira_name="Spark Submit"),
+    Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+    Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True, 
jira_name="Web UI"),
+    Component("WINDOWS", primary=True, jira_name="Windows"),
+    Component("YARN", primary=True, jira_name="YARN"),
+    # Status markers — recognized in PR titles, but not JIRA components.
+    Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+    Component("MINOR"),
+    Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")
+_SPARK_ID_RE = re.compile(r"^SPARK-\d+$", re.IGNORECASE)
+_VERSION_TAG_RE = re.compile(r"^\d+\.(\d+|X)$")
+_LEADING_TAGS = frozenset({"MINOR", "TRIVIAL"})
+
+
+class Title:

Review Comment:
   Added — `Title.parse("[SPARK-1234]")` doctest now covers both the empty-body 
parse and the `__str__` 'no text' branch.



##########
dev/merge_spark_pr.py:
##########
@@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment):
         resolve_jira_issue(merge_branches, comment, jira_id)
 
 
-def standardize_jira_ref(text):
-    """
-    Standardize the [SPARK-XXXXX] [MODULE] prefix
-    Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK 
XXX [MLLIB]: Issue" to
-    "[SPARK-XXX][MLLIB] Issue"
-
-    >>> standardize_jira_ref(
-    ...     "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete 
is successful")
-    '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is 
successful'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in 
pull requests")
-    '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull 
requests'
-    >>> standardize_jira_ref("[MLlib] Spark  5954: Top by key")
-    '[SPARK-5954][MLLIB] Top by key'
-    >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing 
in TaskSchedulerImpl")
-    '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1094 Support MiMa for reporting binary compatibility across 
versions.")
-    '[SPARK-1094] Support MiMa for reporting binary compatibility across 
versions.'
-    >>> standardize_jira_ref("[WIP]  [SPARK-1146] Vagrant support for Spark")
-    '[SPARK-1146][WIP] Vagrant support for Spark'
-    >>> standardize_jira_ref(
-    ...     "SPARK-1032. If Yarn app fails before registering, app master 
stays aroun...")
-    '[SPARK-1032] If Yarn app fails before registering, app master stays 
aroun...'
-    >>> standardize_jira_ref(
-    ...     "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved 
words in DDL parser.")
-    '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in 
DDL parser.'
-    >>> standardize_jira_ref(
-    ...     'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with 
F.lit"')
-    'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"'
-    >>> standardize_jira_ref("Additional information for users building from 
source code")
-    'Additional information for users building from source code'
+
+class Component:
+    """A Spark PR-title tag, paired with its canonical JIRA component name.
+
+    ``jira_name`` is the canonical name of the SPARK JIRA component (e.g.
+    "Documentation"); empty for status markers like [MINOR] that are not
+    JIRA components but are still recognized in PR titles.
+
+    ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets,
+    e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to
+    the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC").
+
+    ``primary`` marks components whose presence alone satisfies the merge-time
+    requirement. Non-primary JIRA components (e.g. [TEST], [SHUFFLE])
+    remain recognized — they normalize and pass through validation — but
+    they must be paired with a primary tag (e.g. [SQL][TEST]). Status
+    markers are never primary. [WIP] is intentionally absent from the
+    registry: a WIP PR should be aborted at the earlier WIP warning, not
+    merged.
     """
-    jira_refs = []
-    components = []
 
-    # If this is a Revert PR, no need to process any further
-    if text.startswith('Revert "') and text.endswith('"'):
-        return text
+    def __init__(self, tag, aliases=(), primary=False, jira_name=""):
+        self.tag = tag
+        self.aliases = frozenset(aliases)
+        self.primary = primary
+        self.jira_name = jira_name
 
-    # If the string is compliant, no need to process any further
-    if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text):
-        return text
+    def matches(self, token):
+        return token == self.tag or token in self.aliases
 
-    # Extract JIRA ref(s):
-    pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE)
-    for ref in pattern.findall(text):
-        # Add brackets, replace spaces with a dash, & convert to uppercase
-        jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]")
-        text = text.replace(ref, "")
+    @classmethod
+    def find(cls, token):
+        """Return the Component matching ``token`` (case-insensitive), or 
None."""
+        if token is None:
+            return None
+        token = token.strip().upper()
+        for c in COMPONENTS:
+            if c.matches(token):
+                return c
+        return None
 
-    # Extract spark component(s):
-    # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE)
-    for component in pattern.findall(text):
-        components.append(component.upper())
-        text = text.replace(component, "")
 
-    # Cleanup any remaining symbols:
-    pattern = re.compile(r"^\W+(.*)", re.IGNORECASE)
-    if pattern.search(text) is not None:
-        text = pattern.search(text).groups()[0]
 
-    # Assemble full text (JIRA ref(s), module(s), remaining text)
-    clean_text = "".join(jira_refs).strip() + "".join(components).strip() + " 
" + text.strip()
+# Full SPARK JIRA component list (sorted alphabetically by tag), followed
+# by status markers. Keep in sync with the components in JIRA — fetch the
+# current list with:
+#   curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components
+# A `primary=True` marker indicates the tag alone satisfies the merge-time
+# component requirement; non-primary JIRA components must be paired with a
+# primary one (e.g. [SQL][TEST], [CORE][SHUFFLE]). Status
+# markers leave `jira_name` empty.
+COMPONENTS = (
+    Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"),
+    Component("BUILD", primary=True, jira_name="Build"),
+    Component("CONNECT", primary=True, jira_name="Connect"),
+    Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True, 
jira_name="Spark Core"),
+    Component("DEPLOY", jira_name="Deploy"),
+    Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True, 
jira_name="Documentation"),
+    Component("DOCKER", primary=True, jira_name="Spark Docker"),
+    Component("EC2", jira_name="EC2"),
+    Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"),
+    Component("GRAPHX", primary=True, jira_name="GraphX"),
+    Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True, 
jira_name="Project Infra"),
+    Component("IO", jira_name="Input/Output"),
+    Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java 
API"),
+    Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"),
+    Component("MESOS", jira_name="Mesos"),
+    Component("ML", primary=True, jira_name="ML"),
+    Component("MLLIB", primary=True, jira_name="MLlib"),
+    Component("OPTIMIZER", jira_name="Optimizer"),
+    Component("PROTOBUF", jira_name="Protobuf"),
+    Component("PS", primary=True, jira_name="Pandas API on Spark"),
+    Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"),
+    Component("R", ("SPARKR",), primary=True, jira_name="R"),
+    Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"), 
jira_name="Spark Shell"),
+    Component("SCHEDULER", jira_name="Scheduler"),
+    Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative 
Pipelines"),
+    Component("SECURITY", primary=True, jira_name="Security"),
+    Component("SHUFFLE", jira_name="Shuffle"),
+    # Component("SPARKR", jira_name="SparkR"),  # SPARKR is treated as an 
alias of R above
+    Component("SQL", primary=True, jira_name="SQL"),
+    Component("SS", primary=True, jira_name="Structured Streaming"),
+    Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True, 
jira_name="DStreams"),
+    Component("SUBMIT", jira_name="Spark Submit"),
+    Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"),
+    Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True, 
jira_name="Web UI"),
+    Component("WINDOWS", primary=True, jira_name="Windows"),
+    Component("YARN", primary=True, jira_name="YARN"),
+    # Status markers — recognized in PR titles, but not JIRA components.
+    Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")),
+    Component("MINOR"),
+    Component("TRIVIAL"),
+)
+
+
+_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]")
+_SPARK_ID_RE = re.compile(r"^SPARK-\d+$", re.IGNORECASE)
+_VERSION_TAG_RE = re.compile(r"^\d+\.(\d+|X)$")
+_LEADING_TAGS = frozenset({"MINOR", "TRIVIAL"})
+
+
+class Title:
+    """Structured PR title: SPARK refs, component tags, and body.
+
+    ``leading``    — SPARK-NNNNN IDs and [MINOR]/[TRIVIAL] markers, in order.
+    ``components`` — all other bracket tags, in order.
+    ``text``       — body text following the bracket sequence.
+
+    >>> t = Title.parse("[SPARK-1234][SQL] Fix something")
+    >>> t.leading, t.components, t.text
+    (['SPARK-1234'], ['SQL'], 'Fix something')
+    >>> str(t)
+    '[SPARK-1234][SQL] Fix something'
+    >>> t = Title.parse("[SPARK-1234][SQL][FOLLOWUP] Fix something")
+    >>> t.leading, t.components, t.text
+    (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix something')
+    >>> str(t)
+    '[SPARK-1234][SQL][FOLLOWUP] Fix something'
+    """
 
-    # Replace multiple spaces with a single space, e.g. if no jira refs and/or 
components were
-    # included
-    clean_text = re.sub(r"\s+", " ", clean_text.strip())
+    def __init__(
+        self,
+        leading: List[str],
+        components: List[str],
+        text: str,
+    ) -> None:
+        self.leading = leading
+        self.components = components
+        self.text = text
+
+    @classmethod
+    def parse(cls, raw: str) -> "Title":
+        """Parse a PR title string into a :class:`Title`.
+
+        A title must open with a leading tag ([SPARK-NNNNN], [MINOR], or
+        [TRIVIAL]); otherwise :exc:`ValueError` is raised.  Additional
+        [SPARK-NNNNN] tags are also collected into ``leading``; all other
+        bracket tokens (upper-cased, spaces trimmed, separated by optional
+        whitespace) go to ``components``.  The remainder is ``text``.
+
+        >>> t = Title.parse("[SPARK-1234][SQL][TESTS] Fix something")
+        >>> t.leading, t.components, t.text
+        (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something')
+        >>> t = Title.parse("  [ SPARK-1234 ]  [ SQL ] [  TESTS  ]   Fix 
something")
+        >>> t.leading, t.components, t.text
+        (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something')
+        >>> t = Title.parse("[SPARK-1234 ][ sql ][ followup ] Fix")
+        >>> t.leading, t.components, t.text
+        (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix')
+        >>> str(t)
+        '[SPARK-1234][SQL][FOLLOWUP] Fix'
+        >>> Title.parse("[MINOR] Fix typo").leading
+        ['MINOR']
+        >>> t = Title.parse("[spark-1234][sql][followup] Fix")
+        >>> t.leading, t.components
+        (['SPARK-1234'], ['SQL', 'FOLLOWUP'])
+        >>> Title.parse("[SPARK-1234][SPARK-5678][SQL] Fix").leading
+        ['SPARK-1234', 'SPARK-5678']
+        >>> Title.parse("[SPARK-1234][4.X][SQL] Fix").components
+        ['4.X', 'SQL']
+        >>> Title.parse("[SPARK-1234][SQL][4.2] Fix").components
+        ['SQL', '4.2']
+        >>> Title.parse("[SQL] Fix")
+        Traceback (most recent call last):
+            ...
+        ValueError: title must start with [SPARK-NNNNN], [MINOR], or 
[TRIVIAL]: '[SQL] Fix'
+        >>> Title.parse("No brackets")
+        Traceback (most recent call last):
+            ...
+        ValueError: title must start with [SPARK-NNNNN], [MINOR], or 
[TRIVIAL]: 'No brackets'
+        >>> Title.parse("[SPARK-1234][SQL][SPARK-123] Fix")
+        Traceback (most recent call last):
+            ...
+        ValueError: [SPARK-NNNNN] tags must all appear before other tags: 
'[SPARK-1234][SQL][SPARK-123] Fix'
+        >>> Title.parse("[SPARK-1234][MINOR][SQL] Fix")
+        Traceback (most recent call last):
+            ...
+        ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist
+        >>> Title.parse("[MINOR][TRIVIAL][SQL] Fix")
+        Traceback (most recent call last):
+            ...
+        ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist
+        """
+        leading: List[str] = []
+        components: List[str] = []
+
+        raw = raw.strip()
+        m0 = _BRACKET_TAG_RE.match(raw)
+        first = m0.group(1).upper() if m0 else ""
+        if not (_SPARK_ID_RE.match(first) or first in _LEADING_TAGS):
+            raise ValueError(
+                "title must start with [SPARK-NNNNN], [MINOR], or [TRIVIAL]: 
%r" % raw
+            )
 
-    return clean_text
+        past_leading = False
+        pos = 0
+        while pos < len(raw):
+            m = _BRACKET_TAG_RE.match(raw, pos)
+            if not m:
+                break
+            tag = m.group(1).upper()
+            if _SPARK_ID_RE.match(tag):
+                if past_leading:
+                    raise ValueError(
+                        "[SPARK-NNNNN] tags must all appear before other tags: 
%r" % raw
+                    )
+                leading.append(tag)
+            elif tag in _LEADING_TAGS:
+                leading.append(tag)
+            else:
+                components.append(tag)
+                past_leading = True
+            pos = m.end()
+            while pos < len(raw) and raw[pos] == " ":
+                pos += 1
+
+        text = raw[pos:].lstrip()
+        markers = [t for t in leading if t in _LEADING_TAGS]
+        if len(markers) > 1 or (markers and len(leading) > len(markers)):
+            raise ValueError(
+                "[SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist"
+            )
+        return cls(leading, components, text)
+
+    def __str__(self) -> str:
+        parts = "".join("[%s]" % t for t in self.leading)
+        parts += "".join("[%s]" % c for c in self.components)
+        if not self.text:
+            return parts
+        return parts + (" " if parts else "") + self.text
+
+
+def prompt_for_components():
+    """
+    Prompt the committer for component(s) when the PR title lacks a primary
+    component. Each entered token is normalized via Component.find
+    (e.g. "DOCS" -> "DOC", "PYSPARK" -> "PYTHON"). Unrecognized tokens are
+    passed through as-is. Re-prompts until at least one entered token resolves
+    to a tag in PRIMARY_COMPONENTS. Returns an uppercase list of tags in

Review Comment:
   Done — replaced with 'a primary Component (one with primary=True)'.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-56979][INFRA] Require COMPONENT tag in PR title at merge time [spark]

Reply via email to