This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5574-6182c6c81df93fabf9ea22eb1350db1642ce9bb0 in repository https://gitbox.apache.org/repos/asf/texera.git
commit c3161f79e26a37bd5ac0190603d50ec875d6548e Author: Anish Shivamurthy <[email protected]> AuthorDate: Tue Jun 23 15:50:55 2026 -0700 feat(huggingface): add qa and ranking tasks (#5574) ## What changes were proposed in this PR? Adds the QA/ranking/classification task family — 5 HF pipeline tasks — as a new `TaskCodegen` plugged into the dispatcher established by the text-generation PR: QA tasks: `question-answering`, `table-question-answering`; classification/ranking tasks: `zero-shot-classification`, `sentence-similarity`, `text-ranking`. `codegen/QaRankingCodegen.scala` supplies the per-task payload + parse Python branches for all 5 tasks. `CodegenContext` is extended with `contextColumn`, `candidateLabels`, and `sentencesColumn` (`EncodableString`). `HuggingFaceInferenceOpDesc.scala` gains 3 new `@JsonProperty` fields and registers `QaRankingCodegen` in the dispatcher. `PythonCodegenBase.scala` grows to host the shared QA/ranking infrastructure: - Table-level validation for the new fields (the context/sentences columns must exist in the input table; candidate labels must be non-empty). - `question-answering` payload handling with prompt + context. - `table-question-answering` payload handling with table data. - `zero-shot-classification` payload handling with candidate labels. - `sentence-similarity` and `text-ranking` payload handling with sentence inputs. - Response parsing for QA/ranking outputs. User-input strings continue to flow through `pyb"..."` + `EncodableString` so they reach Python as `self.decode_python_template('<base64>')` rather than raw literals. `PythonCodeRawInvalidTextSpec` still passes: 117/117 descriptors compile cleanly under `py_compile`. ## Any related issues, documentation, or discussions? Tracking issue: Add HuggingFace question answering and ranking tasks #5292 Closes #5292 Stacked on: PR 4 audio/media generation tasks / `hf/04-audio-mediagen` Parent issue: Add Hugging Face inference operator #5041 Closed sibling issue: Add HuggingFaceModelResource REST endpoints for HF operator UI #5134 ## How was this PR tested? 
`sbt "WorkflowOperator/compile; WorkflowOperator/Test/compile"` clean. `sbt "WorkflowOperator/testOnly org.apache.texera.amber.operator.huggingFace.HuggingFaceInferenceOpDescSpec org.apache.texera.amber.util.PythonCodeRawInvalidTextSpec"` — 31 focused tests pass, including HuggingFace QA/ranking task coverage and the raw Python descriptor scan. `sbt "WorkflowOperator / scalafmtCheck"` clean. `sbt "WorkflowOperator / Test / scalafmtCheck"` clean. `PythonCodeRawInvalidTextSpec` — 117/117 descriptors py_compile cleanly with the new operator code paths, no marker leaks. ## Was this PR authored or co-authored using generative AI tooling? Yes, co-authored with generative AI tooling (Codex). --- .../huggingFace/HuggingFaceInferenceOpDesc.scala | 32 ++++++++- .../huggingFace/codegen/PythonCodegenBase.scala | 34 +++++++++ .../huggingFace/codegen/QaRankingCodegen.scala | 84 ++++++++++++++++++++++ .../operator/huggingFace/codegen/TaskCodegen.scala | 5 +- .../HuggingFaceInferenceOpDescSpec.scala | 83 ++++++++++++++++++++- 5 files changed, 235 insertions(+), 3 deletions(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala index f7805266cf..0e1062c75a 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala @@ -30,6 +30,7 @@ import org.apache.texera.amber.operator.huggingFace.codegen.{ ImageTaskCodegen, MediaGenCodegen, PythonCodegenBase, + QaRankingCodegen, TaskCodegen, TextGenCodegen } @@ -108,6 +109,25 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var inputAudioColumn: EncodableString = "" + @JsonProperty(value = "contextColumn", 
required = false) + @JsonSchemaTitle("Context Column") + @JsonPropertyDescription("Column containing the context passage for question answering") + @AutofillAttributeName + var contextColumn: EncodableString = "" + + @JsonProperty(value = "candidateLabels", required = false) + @JsonSchemaTitle("Candidate Labels") + @JsonPropertyDescription("Comma-separated candidate labels for zero-shot classification") + var candidateLabels: EncodableString = "" + + @JsonProperty(value = "sentencesColumn", required = false) + @JsonSchemaTitle("Sentences Column") + @JsonPropertyDescription( + "Column with comma-separated sentences for sentence similarity and text ranking" + ) + @AutofillAttributeName + var sentencesColumn: EncodableString = "" + @JsonProperty( value = "systemPrompt", required = false, @@ -153,6 +173,7 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor { ImageTaskCodegen.tasks.foreach(t => byTask += (t -> ImageTaskCodegen)) AudioTaskCodegen.tasks.foreach(t => byTask += (t -> AudioTaskCodegen)) MediaGenCodegen.tasks.foreach(t => byTask += (t -> MediaGenCodegen)) + QaRankingCodegen.tasks.foreach(t => byTask += (t -> QaRankingCodegen)) byTask.toMap } @@ -200,6 +221,12 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor { if (audioInput == null) "" else audioInput val safeInputAudioColumn: EncodableString = if (inputAudioColumn == null) "" else inputAudioColumn + val safeContextColumn: EncodableString = + if (contextColumn == null) "" else contextColumn + val safeCandidateLabels: EncodableString = + if (candidateLabels == null) "" else candidateLabels + val safeSentencesColumn: EncodableString = + if (sentencesColumn == null) "" else sentencesColumn val ctx = CodegenContext( hfApiToken = safeToken, @@ -213,7 +240,10 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor { imageInput = safeImageInput, inputImageColumn = safeInputImageColumn, audioInput = safeAudioInput, - inputAudioColumn = safeInputAudioColumn + 
inputAudioColumn = safeInputAudioColumn, + contextColumn = safeContextColumn, + candidateLabels = safeCandidateLabels, + sentencesColumn = safeSentencesColumn ) PythonCodegenBase.render(ctx, codegenForTask(safeTask)) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala index 8671b9a76a..7cd305bfb6 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala @@ -59,6 +59,9 @@ object PythonCodegenBase { val inputImageColumn = ctx.inputImageColumn val audioInput = ctx.audioInput val inputAudioColumn = ctx.inputAudioColumn + val contextColumn = ctx.contextColumn + val candidateLabels = ctx.candidateLabels + val sentencesColumn = ctx.sentencesColumn pyb"""import os |import re |import json @@ -141,6 +144,9 @@ object PythonCodegenBase { | self.INPUT_IMAGE_COLUMN = $inputImageColumn | self.AUDIO_INPUT = $audioInput | self.INPUT_AUDIO_COLUMN = $inputAudioColumn + | self.CONTEXT_COLUMN = $contextColumn + | self.CANDIDATE_LABELS = $candidateLabels + | self.SENTENCES_COLUMN = $sentencesColumn | | def _resolve_providers(self, token): | '''Query the HF Hub API for inference providers serving this model. @@ -491,6 +497,24 @@ object PythonCodegenBase { | f"Prompt column '{prompt_col}' not found in input table. " | f"Available columns: {list(table.columns)}" | ) + | if task == "zero-shot-classification": + | labels = [l.strip() for l in str(self.CANDIDATE_LABELS).split(",") if l.strip()] + | assert labels, ( + | "Candidate Labels are required for zero-shot-classification. " + | "Provide a comma-separated list of labels." 
+ | ) + | if task == "question-answering": + | ctx_col = self.CONTEXT_COLUMN + | assert ctx_col and ctx_col in table.columns, ( + | f"Context column '{ctx_col}' not found in input table. " + | f"Available columns: {list(table.columns)}" + | ) + | if task in ("sentence-similarity", "text-ranking"): + | sent_col = self.SENTENCES_COLUMN + | assert sent_col and sent_col in table.columns, ( + | f"Sentences column '{sent_col}' not found in input table. " + | f"Available columns: {list(table.columns)}" + | ) | | # --- handle empty table --- | if table.empty: @@ -506,6 +530,16 @@ object PythonCodegenBase { | "Authorization": f"Bearer {token}", | "Content-Type": "application/octet-stream", | } + | # --- pre-compute table dict for table-question-answering --- + | table_dict = None + | if task == "table-question-answering": + | table_dict = {} + | for col in table.columns: + | if col != prompt_col and col != result_col: + | table_dict[col] = [ + | str(v) if not pd.isna(v) else "" for v in table[col].tolist() + | ] + | | # --- resolve image source (upload or column) for image tasks --- | has_image_upload = bool(self.IMAGE_INPUT) and bool(str(self.IMAGE_INPUT).strip()) | use_image_column = not has_image_upload and bool(self.INPUT_IMAGE_COLUMN) and self.INPUT_IMAGE_COLUMN in table.columns diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala new file mode 100644 index 0000000000..18c6897499 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.huggingFace.codegen + +/** + * Codegen for question-answering, zero-shot, similarity, and ranking tasks. + * + * These tasks are prompt-driven but need extra per-row or per-operator + * inputs: context text, candidate labels, table contents, or a list of + * comparison sentences/documents. + */ +object QaRankingCodegen extends TaskCodegen { + + override val task: String = "question-answering" + + override val tasks: Set[String] = Set( + "question-answering", + "table-question-answering", + "zero-shot-classification", + "sentence-similarity", + "text-ranking" + ) + + override def payloadPython(ctx: CodegenContext): String = + """ if task == "question-answering": + | ctx_val = row[self.CONTEXT_COLUMN] + | ctx_val = "" if pd.isna(ctx_val) else str(ctx_val) + | payload = {"inputs": {"question": prompt_value, "context": ctx_val}} + | elif task == "table-question-answering": + | payload = {"inputs": {"query": prompt_value, "table": table_dict}} + | elif task == "zero-shot-classification": + | labels = [l.strip() for l in str(self.CANDIDATE_LABELS).split(",") if l.strip()] + | payload = { + | "inputs": prompt_value, + | "parameters": {"candidate_labels": labels}, + | } + | elif task == "sentence-similarity": + | sent_val = row[self.SENTENCES_COLUMN] + | sent_val = "" if pd.isna(sent_val) else str(sent_val) + | sentences_list = [s.strip() for s 
in sent_val.split(",") if s.strip()] + | payload = { + | "inputs": { + | "source_sentence": prompt_value, + | "sentences": sentences_list, + | } + | } + | elif task == "text-ranking": + | sent_val = row[self.SENTENCES_COLUMN] + | sent_val = "" if pd.isna(sent_val) else str(sent_val) + | sentences_list = [s.strip() for s in sent_val.split(",") if s.strip()] + | payload = { + | "inputs": { + | "query": prompt_value, + | "texts": sentences_list, + | } + | } + | else: + | payload = {"inputs": prompt_value}""".stripMargin + + override def parsePython(ctx: CodegenContext): String = + """ if task == "question-answering": + | return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body) + | elif task == "table-question-answering": + | return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body) + | elif task in ("zero-shot-classification", "sentence-similarity", "text-ranking"): + | return json.dumps(body)""".stripMargin +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala index 80bbcc58fc..8abcef721b 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala @@ -41,7 +41,10 @@ final case class CodegenContext( imageInput: EncodableString = "", inputImageColumn: EncodableString = "", audioInput: EncodableString = "", - inputAudioColumn: EncodableString = "" + inputAudioColumn: EncodableString = "", + contextColumn: EncodableString = "", + candidateLabels: EncodableString = "", + sentencesColumn: EncodableString = "" ) /** diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala index eb728945f3..83f6239903 100644 --- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala @@ -25,6 +25,7 @@ import org.apache.texera.amber.operator.huggingFace.codegen.{ AudioTaskCodegen, CodegenContext, MediaGenCodegen, + QaRankingCodegen, TextGenCodegen } import org.apache.texera.amber.operator.metadata.OperatorGroupConstants @@ -46,7 +47,10 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers { imageInput: EncodableString = "", inputImageColumn: EncodableString = "", audioInput: EncodableString = "", - inputAudioColumn: EncodableString = "" + inputAudioColumn: EncodableString = "", + contextColumn: EncodableString = "", + candidateLabels: EncodableString = "", + sentencesColumn: EncodableString = "" ): HuggingFaceInferenceOpDesc = { val desc = new HuggingFaceInferenceOpDesc() desc.hfApiToken = token @@ -61,6 +65,9 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers { desc.inputImageColumn = inputImageColumn desc.audioInput = audioInput desc.inputAudioColumn = inputAudioColumn + desc.contextColumn = contextColumn + desc.candidateLabels = candidateLabels + desc.sentencesColumn = sentencesColumn desc } @@ -163,6 +170,9 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers { desc.inputImageColumn = null desc.audioInput = null desc.inputAudioColumn = null + desc.contextColumn = null + desc.candidateLabels = null + desc.sentencesColumn = null val code = desc.generatePythonCode() code should include("class ProcessTableOperator(UDFTableOperator):") code should 
include("def open(self):") @@ -497,6 +507,77 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers { } } + "qa and ranking task family" should + "route question-answering through QaRankingCodegen with context-column validation" in { + val code = makeDesc(task = "question-answering", contextColumn = "context").generatePythonCode() + code should include("self.CONTEXT_COLUMN = ") + code should include("""if task == "question-answering":""") + code should include("ctx_col = self.CONTEXT_COLUMN") + code should include("Context column") + code should include("""payload = {"inputs": {"question": prompt_value, "context": ctx_val}}""") + code should include( + """return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)""" + ) + } + + it should "route table-question-answering with a precomputed table payload" in { + val code = makeDesc(task = "table-question-answering").generatePythonCode() + code should include("""if task == "table-question-answering":""") + code should include("table_dict = {}") + code should include("""payload = {"inputs": {"query": prompt_value, "table": table_dict}}""") + code should include( + """return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)""" + ) + } + + it should "route zero-shot-classification with candidate labels" in { + val code = + makeDesc(task = "zero-shot-classification", candidateLabels = "positive,negative") + .generatePythonCode() + code should include("self.CANDIDATE_LABELS = ") + code should include("""if task == "zero-shot-classification":""") + code should include( + "labels = [l.strip() for l in str(self.CANDIDATE_LABELS).split" + ) + code should include("Candidate Labels are required for zero-shot-classification.") + code should include("""elif task == "zero-shot-classification":""") + code should include("labels = [l.strip() for l in str(self.CANDIDATE_LABELS).split") + code should include(""""parameters": {"candidate_labels": 
labels}""") + } + + it should "route sentence-similarity and text-ranking with sentences-column validation" in { + Seq("sentence-similarity", "text-ranking").foreach { taskName => + val code = makeDesc(task = taskName, sentencesColumn = "sentences").generatePythonCode() + code should include("self.SENTENCES_COLUMN = ") + code should include("sent_col = self.SENTENCES_COLUMN") + code should include("Sentences column") + if (taskName == "sentence-similarity") { + code should include("""elif task == "sentence-similarity":""") + code should include(""""source_sentence": prompt_value""") + code should include(""""sentences": sentences_list""") + } else { + code should include("""elif task == "text-ranking":""") + code should include(""""query": prompt_value""") + code should include(""""texts": sentences_list""") + } + } + } + + it should "register all qa and ranking task strings under the dispatcher" in { + QaRankingCodegen.tasks should contain allOf ( + "question-answering", + "table-question-answering", + "zero-shot-classification", + "sentence-similarity", + "text-ranking" + ) + QaRankingCodegen.tasks.foreach { t => + val code = makeDesc(task = t, contextColumn = "context", sentencesColumn = "sentences") + .generatePythonCode() + code should include("""if task == "question-answering":""") + } + } + "getOutputSchemas" should "add the result column as a STRING to the inherited schema" in { val desc = makeDesc(resultColumn = "answer") val inputSchema = Schema().add("prompt", AttributeType.STRING)
