Re: [PR] feat(huggingFace): refactor operator into per-task codegen + text-generation [texera]

via GitHub Tue, 09 Jun 2026 17:51:13 -0700


Ma77Ball commented on code in PR #5278:
URL: https://github.com/apache/texera/pull/5278#discussion_r3383885473



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala:
##########
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace
+
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
+import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, 
PortIdentity}
+import org.apache.texera.amber.operator.PythonOperatorDescriptor
+import org.apache.texera.amber.operator.huggingFace.codegen.{
+  CodegenContext,
+  PythonCodegenBase,
+  TaskCodegen,
+  TextGenCodegen
+}
+import 
org.apache.texera.amber.operator.metadata.annotations.AutofillAttributeName
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, 
OperatorInfo}
+import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
+
+/**
+  * Generic Hugging Face inference operator.
+  *
+  * This is the first slice of a feature that will eventually cover ~20 HF
+  * pipeline tasks. PR 2 ships text-generation only; image, audio,
+  * media-generation, and QA task families land in subsequent PRs as new
+  * `TaskCodegen` implementations registered in `registeredCodegens`.
+  *
+  * The Python script that runs at execution time is assembled by
+  * `PythonCodegenBase.render(ctx, codegen)`, which composes the shared
+  * provider-fallback / request-loop infrastructure with the per-task
+  * payload + parse snippets supplied by the selected `TaskCodegen`.
+  *
+  * User-provided string fields are typed as [[EncodableString]] so the
+  * `pyb"..."` macro inside `PythonCodegenBase` emits them as
+  * base64-decoded expressions at runtime instead of raw Python literals —
+  * this is what allows the operator to satisfy
+  * `PythonCodeRawInvalidTextSpec`'s contract that arbitrary `@JsonProperty`
+  * values must not leak into generated source.
+  */
+class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
+
+  @JsonProperty(value = "hfApiToken", required = true)
+  @JsonSchemaTitle("HF API Token")
+  @JsonPropertyDescription(
+    "Your Hugging Face API token (from https://huggingface.co/settings/tokens)"
+  )
+  var hfApiToken: EncodableString = ""
+
+  @JsonProperty(value = "task", required = true, defaultValue = 
"text-generation")
+  @JsonSchemaTitle("Task")
+  @JsonPropertyDescription("The Hugging Face pipeline task type")
+  var task: EncodableString = "text-generation"
+
+  @JsonProperty(
+    value = "modelId",
+    required = true,
+    defaultValue = "Qwen/Qwen2.5-72B-Instruct"
+  )
+  @JsonSchemaTitle("Model")
+  @JsonPropertyDescription("Select a Hugging Face model")
+  var modelId: EncodableString = "Qwen/Qwen2.5-72B-Instruct"
+
+  @JsonProperty(value = "promptColumn", required = true)
+  @JsonSchemaTitle("Prompt Column")
+  @JsonPropertyDescription("Column in the input table to use as the user 
prompt")
+  @AutofillAttributeName
+  var promptColumn: EncodableString = ""
+
+  @JsonProperty(
+    value = "systemPrompt",
+    required = false,
+    defaultValue = "You are a helpful assistant."
+  )
+  @JsonSchemaTitle("System Prompt")
+  @JsonPropertyDescription("Optional system message to set model behavior")
+  var systemPrompt: EncodableString = "You are a helpful assistant."
+
+  @JsonProperty(value = "maxNewTokens", required = false, defaultValue = "256")
+  @JsonSchemaTitle("Max New Tokens")
+  @JsonPropertyDescription("Maximum number of tokens to generate (1-4096)")
+  var maxNewTokens: java.lang.Integer = 256
+
+  @JsonProperty(value = "temperature", required = false)
+  @JsonSchemaTitle("Temperature")
+  @JsonPropertyDescription("Sampling temperature (0.0 = deterministic, up to 
2.0)")
+  var temperature: java.lang.Double = 0.7
+
+  @JsonProperty(
+    value = "resultColumn",
+    required = false,
+    defaultValue = "hf_response"
+  )
+  @JsonSchemaTitle("Result Column Name")
+  @JsonPropertyDescription("Name of the new column added to the output table")
+  var resultColumn: EncodableString = "hf_response"
+
+  /**
+    * Per-task code generators. New entries are added as task families land
+    * in subsequent PRs (e.g. ImageTaskCodegen, AudioTaskCodegen, etc.).
+    *
+    * An unrecognized task string falls back to [[TextGenCodegen]]; the
+    * generated Python's `else` branch then produces a generic `{"inputs":
+    * prompt_value}` payload and the HF endpoint surfaces the real error at
+    * runtime. This matches the original monolithic operator's behavior and
+    * keeps `generatePythonCode` total (it never throws on arbitrary input,
+    * which is required by `PythonCodeRawInvalidTextSpec`).
+    */
+  private val registeredCodegens: Map[String, TaskCodegen] =
+    Map(TextGenCodegen.task -> TextGenCodegen)
+
+  private def codegenForTask(t: String): TaskCodegen =
+    registeredCodegens.getOrElse(t, TextGenCodegen)
+
+  override def generatePythonCode(): String = {
+    val safeTask: EncodableString =
+      if (task == null || task.trim.isEmpty) "text-generation" else task
+    val safeModelId: EncodableString =
+      if (modelId == null) "" else modelId.trim
+    val safePromptCol: EncodableString =
+      if (promptColumn == null) "" else promptColumn
+    val safeResultCol: EncodableString =
+      if (resultColumn == null || resultColumn.trim.isEmpty) "hf_response" 
else resultColumn
+    val safeSystemPrompt: EncodableString =
+      if (systemPrompt == null) "" else systemPrompt
+    val safeToken: EncodableString =
+      if (hfApiToken == null) "" else hfApiToken
+
+    val safeMaxTokens =
+      math.max(1, math.min(if (maxNewTokens != null) maxNewTokens.intValue 
else 256, 4096))
+    val safeTemp =
+      math.max(0.0, math.min(if (temperature != null) temperature.doubleValue 
else 0.7, 2.0))
+
+    val ctx = CodegenContext(
+      hfApiToken = safeToken,
+      modelId = safeModelId,
+      promptColumn = safePromptCol,
+      resultColumn = safeResultCol,
+      task = safeTask,
+      systemPrompt = safeSystemPrompt,
+      safeMaxTokens = safeMaxTokens,
+      safeTemp = safeTemp
+    )
+
+    PythonCodegenBase.render(ctx, codegenForTask(safeTask))
+  }
+
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo(
+      "Hugging Face",
+      "Call a Hugging Face model via the Inference API",
+      OperatorGroupConstants.HUGGINGFACE_GROUP,
+      inputPorts = List(InputPort()),
+      outputPorts = List(OutputPort())
+    )
+
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    val resCol =
+      if (resultColumn == null || resultColumn.trim.isEmpty) "hf_response"
+      else resultColumn

Review Comment:
   The `"hf_response"` default and the null/empty fallback are implemented here 
and again in `generatePythonCode` (line 139). If one changes, the two will 
silently drift. Consider a small shared helper used by both:
   
   ```scala
   private def resolvedResultColumn: String =
     if (resultColumn == null || resultColumn.trim.isEmpty) "hf_response"
     else resultColumn
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(huggingFace): refactor operator into per-task codegen + text-generation [texera]

Reply via email to