(texera) 01/01: feat(huggingface): add qa and ranking tasks (#5574)

github-bot Tue, 23 Jun 2026 15:51:31 -0700

This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch 
gh-readonly-queue/main/pr-5574-6182c6c81df93fabf9ea22eb1350db1642ce9bb0
in repository https://gitbox.apache.org/repos/asf/texera.git


commit c3161f79e26a37bd5ac0190603d50ec875d6548e
Author: Anish Shivamurthy <[email protected]>
AuthorDate: Tue Jun 23 15:50:55 2026 -0700

    feat(huggingface): add qa and ranking tasks (#5574)
    
    ## What changes were proposed in this PR?
    
    Adds the QA/ranking/classification task family — 5 HF pipeline tasks —
    as a new `TaskCodegen` plugged into the dispatcher established by the
    text-generation PR:
    
    QA tasks: `question-answering`, `table-question-answering`
    
    classification/ranking tasks: `zero-shot-classification`,
    `sentence-similarity`, `text-ranking`
    
    `codegen/QaRankingCodegen.scala` supplies the per-task payload + parse
    Python branches for all 5 tasks.
    
    `CodegenContext` is extended with `contextColumn`, `candidateLabels`,
    and `sentencesColumn` (`EncodableString`).
    
    `HuggingFaceInferenceOpDesc.scala` gains 3 new `@JsonProperty` fields
    and registers `QaRankingCodegen` in the dispatcher.
    
    `PythonCodegenBase.scala` grows to host the shared QA/ranking
    infrastructure:
    
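    - Up-front (per-table) validation of the new fields: the context and
    sentences columns must exist in the input table, and candidate labels
    must be non-empty.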
    - `question-answering` payload handling with prompt + context.
    - `table-question-answering` payload handling with table data.
    - `zero-shot-classification` payload handling with candidate labels.
    - `sentence-similarity` and `text-ranking` payload handling with
    sentence inputs.
    - Response parsing for QA/ranking outputs.
    
    User-input strings continue to flow through `pyb"..."` +
    `EncodableString` so they reach Python as
    `self.decode_python_template('<base64>')` rather than raw literals.
    `PythonCodeRawInvalidTextSpec` still passes, with 117/117 descriptors
    compiling cleanly under `py_compile`.
    
    ## Any related issues, documentation, or discussions?
    
    Tracking issue: Add HuggingFace question answering and ranking tasks
    #5292
    
    Closes #5292
    
    Stacked on: PR 4 audio/media generation tasks / `hf/04-audio-mediagen`
    
    Parent issue: Add Hugging Face inference operator #5041
    
    Closed sibling issue: Add HuggingFaceModelResource REST endpoints for HF
    operator UI #5134
    
    ## How was this PR tested?
    
    `sbt "WorkflowOperator/compile; WorkflowOperator/Test/compile"` clean.
    
    `sbt "WorkflowOperator/testOnly
    org.apache.texera.amber.operator.huggingFace.HuggingFaceInferenceOpDescSpec
    org.apache.texera.amber.util.PythonCodeRawInvalidTextSpec"` — 31 focused
    tests pass, including HuggingFace QA/ranking task coverage and the raw
    Python descriptor scan.
    
    `sbt "WorkflowOperator / scalafmtCheck"` clean.
    
    `sbt "WorkflowOperator / Test / scalafmtCheck"` clean.
    
    `PythonCodeRawInvalidTextSpec` — 117/117 descriptors compile cleanly
    under `py_compile` with the new operator code paths, and no markers leak.
    
    ## Was this PR authored or co-authored using generative AI tooling?
    
    Yes, co-authored with generative AI tooling (Codex).
---
 .../huggingFace/HuggingFaceInferenceOpDesc.scala   | 32 ++++++++-
 .../huggingFace/codegen/PythonCodegenBase.scala    | 34 +++++++++
 .../huggingFace/codegen/QaRankingCodegen.scala     | 84 ++++++++++++++++++++++
 .../operator/huggingFace/codegen/TaskCodegen.scala |  5 +-
 .../HuggingFaceInferenceOpDescSpec.scala           | 83 ++++++++++++++++++++-
 5 files changed, 235 insertions(+), 3 deletions(-)

diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
index f7805266cf..0e1062c75a 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
@@ -30,6 +30,7 @@ import org.apache.texera.amber.operator.huggingFace.codegen.{
   ImageTaskCodegen,
   MediaGenCodegen,
   PythonCodegenBase,
+  QaRankingCodegen,
   TaskCodegen,
   TextGenCodegen
 }
@@ -108,6 +109,25 @@ class HuggingFaceInferenceOpDesc extends 
PythonOperatorDescriptor {
   @AutofillAttributeName
   var inputAudioColumn: EncodableString = ""
 
+  @JsonProperty(value = "contextColumn", required = false)
+  @JsonSchemaTitle("Context Column")
+  @JsonPropertyDescription("Column containing the context passage for question 
answering")
+  @AutofillAttributeName
+  var contextColumn: EncodableString = ""
+
+  @JsonProperty(value = "candidateLabels", required = false)
+  @JsonSchemaTitle("Candidate Labels")
+  @JsonPropertyDescription("Comma-separated candidate labels for zero-shot 
classification")
+  var candidateLabels: EncodableString = ""
+
+  @JsonProperty(value = "sentencesColumn", required = false)
+  @JsonSchemaTitle("Sentences Column")
+  @JsonPropertyDescription(
+    "Column with comma-separated sentences for sentence similarity and text 
ranking"
+  )
+  @AutofillAttributeName
+  var sentencesColumn: EncodableString = ""
+
   @JsonProperty(
     value = "systemPrompt",
     required = false,
@@ -153,6 +173,7 @@ class HuggingFaceInferenceOpDesc extends 
PythonOperatorDescriptor {
     ImageTaskCodegen.tasks.foreach(t => byTask += (t -> ImageTaskCodegen))
     AudioTaskCodegen.tasks.foreach(t => byTask += (t -> AudioTaskCodegen))
     MediaGenCodegen.tasks.foreach(t => byTask += (t -> MediaGenCodegen))
+    QaRankingCodegen.tasks.foreach(t => byTask += (t -> QaRankingCodegen))
     byTask.toMap
   }
 
@@ -200,6 +221,12 @@ class HuggingFaceInferenceOpDesc extends 
PythonOperatorDescriptor {
       if (audioInput == null) "" else audioInput
     val safeInputAudioColumn: EncodableString =
       if (inputAudioColumn == null) "" else inputAudioColumn
+    val safeContextColumn: EncodableString =
+      if (contextColumn == null) "" else contextColumn
+    val safeCandidateLabels: EncodableString =
+      if (candidateLabels == null) "" else candidateLabels
+    val safeSentencesColumn: EncodableString =
+      if (sentencesColumn == null) "" else sentencesColumn
 
     val ctx = CodegenContext(
       hfApiToken = safeToken,
@@ -213,7 +240,10 @@ class HuggingFaceInferenceOpDesc extends 
PythonOperatorDescriptor {
       imageInput = safeImageInput,
       inputImageColumn = safeInputImageColumn,
       audioInput = safeAudioInput,
-      inputAudioColumn = safeInputAudioColumn
+      inputAudioColumn = safeInputAudioColumn,
+      contextColumn = safeContextColumn,
+      candidateLabels = safeCandidateLabels,
+      sentencesColumn = safeSentencesColumn
     )
 
     PythonCodegenBase.render(ctx, codegenForTask(safeTask))
diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
index 8671b9a76a..7cd305bfb6 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
@@ -59,6 +59,9 @@ object PythonCodegenBase {
     val inputImageColumn = ctx.inputImageColumn
     val audioInput = ctx.audioInput
     val inputAudioColumn = ctx.inputAudioColumn
+    val contextColumn = ctx.contextColumn
+    val candidateLabels = ctx.candidateLabels
+    val sentencesColumn = ctx.sentencesColumn
     pyb"""import os
        |import re
        |import json
@@ -141,6 +144,9 @@ object PythonCodegenBase {
        |        self.INPUT_IMAGE_COLUMN = $inputImageColumn
        |        self.AUDIO_INPUT = $audioInput
        |        self.INPUT_AUDIO_COLUMN = $inputAudioColumn
+       |        self.CONTEXT_COLUMN = $contextColumn
+       |        self.CANDIDATE_LABELS = $candidateLabels
+       |        self.SENTENCES_COLUMN = $sentencesColumn
        |
        |    def _resolve_providers(self, token):
        |        '''Query the HF Hub API for inference providers serving this 
model.
@@ -491,6 +497,24 @@ object PythonCodegenBase {
        |                f"Prompt column '{prompt_col}' not found in input 
table. "
        |                f"Available columns: {list(table.columns)}"
        |            )
+       |        if task == "zero-shot-classification":
+       |            labels = [l.strip() for l in 
str(self.CANDIDATE_LABELS).split(",") if l.strip()]
+       |            assert labels, (
+       |                "Candidate Labels are required for 
zero-shot-classification. "
+       |                "Provide a comma-separated list of labels."
+       |            )
+       |        if task == "question-answering":
+       |            ctx_col = self.CONTEXT_COLUMN
+       |            assert ctx_col and ctx_col in table.columns, (
+       |                f"Context column '{ctx_col}' not found in input table. 
"
+       |                f"Available columns: {list(table.columns)}"
+       |            )
+       |        if task in ("sentence-similarity", "text-ranking"):
+       |            sent_col = self.SENTENCES_COLUMN
+       |            assert sent_col and sent_col in table.columns, (
+       |                f"Sentences column '{sent_col}' not found in input 
table. "
+       |                f"Available columns: {list(table.columns)}"
+       |            )
        |
        |        # --- handle empty table ---
        |        if table.empty:
@@ -506,6 +530,16 @@ object PythonCodegenBase {
        |            "Authorization": f"Bearer {token}",
        |            "Content-Type": "application/octet-stream",
        |        }
+       |        # --- pre-compute table dict for table-question-answering ---
+       |        table_dict = None
+       |        if task == "table-question-answering":
+       |            table_dict = {}
+       |            for col in table.columns:
+       |                if col != prompt_col and col != result_col:
+       |                    table_dict[col] = [
+       |                        str(v) if not pd.isna(v) else "" for v in 
table[col].tolist()
+       |                    ]
+       |
        |        # --- resolve image source (upload or column) for image tasks 
---
        |        has_image_upload = bool(self.IMAGE_INPUT) and 
bool(str(self.IMAGE_INPUT).strip())
        |        use_image_column = not has_image_upload and 
bool(self.INPUT_IMAGE_COLUMN) and self.INPUT_IMAGE_COLUMN in table.columns
diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala
new file mode 100644
index 0000000000..18c6897499
--- /dev/null
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+  * Codegen for question-answering, zero-shot, similarity, and ranking tasks.
+  *
+  * These tasks are prompt-driven but need extra per-row or per-operator
+  * inputs: context text, candidate labels, table contents, or a list of
+  * comparison sentences/documents.
+  */
+object QaRankingCodegen extends TaskCodegen {
+
+  override val task: String = "question-answering"
+
+  override val tasks: Set[String] = Set(
+    "question-answering",
+    "table-question-answering",
+    "zero-shot-classification",
+    "sentence-similarity",
+    "text-ranking"
+  )
+
+  override def payloadPython(ctx: CodegenContext): String =
+    """            if task == "question-answering":
+      |                ctx_val = row[self.CONTEXT_COLUMN]
+      |                ctx_val = "" if pd.isna(ctx_val) else str(ctx_val)
+      |                payload = {"inputs": {"question": prompt_value, 
"context": ctx_val}}
+      |            elif task == "table-question-answering":
+      |                payload = {"inputs": {"query": prompt_value, "table": 
table_dict}}
+      |            elif task == "zero-shot-classification":
+      |                labels = [l.strip() for l in 
str(self.CANDIDATE_LABELS).split(",") if l.strip()]
+      |                payload = {
+      |                    "inputs": prompt_value,
+      |                    "parameters": {"candidate_labels": labels},
+      |                }
+      |            elif task == "sentence-similarity":
+      |                sent_val = row[self.SENTENCES_COLUMN]
+      |                sent_val = "" if pd.isna(sent_val) else str(sent_val)
+      |                sentences_list = [s.strip() for s in 
sent_val.split(",") if s.strip()]
+      |                payload = {
+      |                    "inputs": {
+      |                        "source_sentence": prompt_value,
+      |                        "sentences": sentences_list,
+      |                    }
+      |                }
+      |            elif task == "text-ranking":
+      |                sent_val = row[self.SENTENCES_COLUMN]
+      |                sent_val = "" if pd.isna(sent_val) else str(sent_val)
+      |                sentences_list = [s.strip() for s in 
sent_val.split(",") if s.strip()]
+      |                payload = {
+      |                    "inputs": {
+      |                        "query": prompt_value,
+      |                        "texts": sentences_list,
+      |                    }
+      |                }
+      |            else:
+      |                payload = {"inputs": prompt_value}""".stripMargin
+
+  override def parsePython(ctx: CodegenContext): String =
+    """            if task == "question-answering":
+      |                return body.get("answer", json.dumps(body)) if 
isinstance(body, dict) else json.dumps(body)
+      |            elif task == "table-question-answering":
+      |                return body.get("answer", json.dumps(body)) if 
isinstance(body, dict) else json.dumps(body)
+      |            elif task in ("zero-shot-classification", 
"sentence-similarity", "text-ranking"):
+      |                return json.dumps(body)""".stripMargin
+}
diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
index 80bbcc58fc..8abcef721b 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
@@ -41,7 +41,10 @@ final case class CodegenContext(
     imageInput: EncodableString = "",
     inputImageColumn: EncodableString = "",
     audioInput: EncodableString = "",
-    inputAudioColumn: EncodableString = ""
+    inputAudioColumn: EncodableString = "",
+    contextColumn: EncodableString = "",
+    candidateLabels: EncodableString = "",
+    sentencesColumn: EncodableString = ""
 )
 
 /**
diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
index eb728945f3..83f6239903 100644
--- 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
@@ -25,6 +25,7 @@ import org.apache.texera.amber.operator.huggingFace.codegen.{
   AudioTaskCodegen,
   CodegenContext,
   MediaGenCodegen,
+  QaRankingCodegen,
   TextGenCodegen
 }
 import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
@@ -46,7 +47,10 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec 
with Matchers {
       imageInput: EncodableString = "",
       inputImageColumn: EncodableString = "",
       audioInput: EncodableString = "",
-      inputAudioColumn: EncodableString = ""
+      inputAudioColumn: EncodableString = "",
+      contextColumn: EncodableString = "",
+      candidateLabels: EncodableString = "",
+      sentencesColumn: EncodableString = ""
   ): HuggingFaceInferenceOpDesc = {
     val desc = new HuggingFaceInferenceOpDesc()
     desc.hfApiToken = token
@@ -61,6 +65,9 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with 
Matchers {
     desc.inputImageColumn = inputImageColumn
     desc.audioInput = audioInput
     desc.inputAudioColumn = inputAudioColumn
+    desc.contextColumn = contextColumn
+    desc.candidateLabels = candidateLabels
+    desc.sentencesColumn = sentencesColumn
     desc
   }
 
@@ -163,6 +170,9 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec 
with Matchers {
     desc.inputImageColumn = null
     desc.audioInput = null
     desc.inputAudioColumn = null
+    desc.contextColumn = null
+    desc.candidateLabels = null
+    desc.sentencesColumn = null
     val code = desc.generatePythonCode()
     code should include("class ProcessTableOperator(UDFTableOperator):")
     code should include("def open(self):")
@@ -497,6 +507,77 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec 
with Matchers {
     }
   }
 
+  "qa and ranking task family" should
+    "route question-answering through QaRankingCodegen with context-column 
validation" in {
+    val code = makeDesc(task = "question-answering", contextColumn = 
"context").generatePythonCode()
+    code should include("self.CONTEXT_COLUMN = ")
+    code should include("""if task == "question-answering":""")
+    code should include("ctx_col = self.CONTEXT_COLUMN")
+    code should include("Context column")
+    code should include("""payload = {"inputs": {"question": prompt_value, 
"context": ctx_val}}""")
+    code should include(
+      """return body.get("answer", json.dumps(body)) if isinstance(body, dict) 
else json.dumps(body)"""
+    )
+  }
+
+  it should "route table-question-answering with a precomputed table payload" 
in {
+    val code = makeDesc(task = "table-question-answering").generatePythonCode()
+    code should include("""if task == "table-question-answering":""")
+    code should include("table_dict = {}")
+    code should include("""payload = {"inputs": {"query": prompt_value, 
"table": table_dict}}""")
+    code should include(
+      """return body.get("answer", json.dumps(body)) if isinstance(body, dict) 
else json.dumps(body)"""
+    )
+  }
+
+  it should "route zero-shot-classification with candidate labels" in {
+    val code =
+      makeDesc(task = "zero-shot-classification", candidateLabels = 
"positive,negative")
+        .generatePythonCode()
+    code should include("self.CANDIDATE_LABELS = ")
+    code should include("""if task == "zero-shot-classification":""")
+    code should include(
+      "labels = [l.strip() for l in str(self.CANDIDATE_LABELS).split"
+    )
+    code should include("Candidate Labels are required for 
zero-shot-classification.")
+    code should include("""elif task == "zero-shot-classification":""")
+    code should include("labels = [l.strip() for l in 
str(self.CANDIDATE_LABELS).split")
+    code should include(""""parameters": {"candidate_labels": labels}""")
+  }
+
+  it should "route sentence-similarity and text-ranking with sentences-column 
validation" in {
+    Seq("sentence-similarity", "text-ranking").foreach { taskName =>
+      val code = makeDesc(task = taskName, sentencesColumn = 
"sentences").generatePythonCode()
+      code should include("self.SENTENCES_COLUMN = ")
+      code should include("sent_col = self.SENTENCES_COLUMN")
+      code should include("Sentences column")
+      if (taskName == "sentence-similarity") {
+        code should include("""elif task == "sentence-similarity":""")
+        code should include(""""source_sentence": prompt_value""")
+        code should include(""""sentences": sentences_list""")
+      } else {
+        code should include("""elif task == "text-ranking":""")
+        code should include(""""query": prompt_value""")
+        code should include(""""texts": sentences_list""")
+      }
+    }
+  }
+
+  it should "register all qa and ranking task strings under the dispatcher" in 
{
+    QaRankingCodegen.tasks should contain allOf (
+      "question-answering",
+      "table-question-answering",
+      "zero-shot-classification",
+      "sentence-similarity",
+      "text-ranking"
+    )
+    QaRankingCodegen.tasks.foreach { t =>
+      val code = makeDesc(task = t, contextColumn = "context", sentencesColumn 
= "sentences")
+        .generatePythonCode()
+      code should include("""if task == "question-answering":""")
+    }
+  }
+
   "getOutputSchemas" should "add the result column as a STRING to the 
inherited schema" in {
     val desc = makeDesc(resultColumn = "answer")
     val inputSchema = Schema().add("prompt", AttributeType.STRING)

(texera) 01/01: feat(huggingface): add qa and ranking tasks (#5574)

Reply via email to