Copilot commented on code in PR #5260: URL: https://github.com/apache/texera/pull/5260#discussion_r3455042342
########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; Review Comment: The LiteLLM proxy under `/api/chat/completions` is protected by Texera's JWT (Authorization: Bearer <access-token>). Using `apiKey` here will send the user's model key (or the default `dummy`) as the Bearer token, so the backend will reject the request with HTTP 401 Unauthorized. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + 
+@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + Review Comment: `initialize()` currently treats the caller-provided `apiKey` as the OpenAI Authorization token. Against Texera's `/api/chat/completions` proxy, this needs to be the Texera access token (JWT); otherwise, all requests will be rejected as unauthorized. 
########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + 
links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} Review Comment: `WorkflowJSON.settings` omits `executionMode`, but workflow settings in the frontend are defined as `{ dataTransferBatchSize, executionMode }`. Returning a workflow JSON without `executionMode` can break downstream consumers that assume the field exists. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new 
Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. 
+ */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. + */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); Review Comment: `String(cell.metadata.uuid)` turns missing UUIDs into the literal marker `undefined`. If multiple cells lack UUIDs, they will collide in the prompt and make the cell↔operator mapping ambiguous. This should be validated (or synthesized) before prompting. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + 
OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } Review Comment: `parseJsonResponse` only strips markdown fences when they are the very first/last tokens. In practice, LLMs often wrap JSON in prose (e.g. "Here is the JSON:```json…```"). Since this is a frontend feature, it's worth being more forgiving to avoid hard failures on minor formatting drift. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { NotebookMigrationLLM, Notebook } from "./migration-llm"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { generateText } from "ai"; +import type { Mock } from "vitest"; + +// The LLM transport and OpenAI client are mocked so the tests exercise only the +// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping). +vi.mock("ai", () => ({ generateText: vi.fn() })); +vi.mock("@ai-sdk/openai", () => ({ + createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })), +})); + +const mockGenerateText = generateText as unknown as Mock; + +describe("NotebookMigrationLLM", () => { + let opIdCounter = 0; + let stubUtil: WorkflowUtilService; + + // Build a fresh, initialized session with stubbed dependencies. The stubbed + // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...). 
+ function makeLLM(): NotebookMigrationLLM { + const stubConfig = { + env: { pythonNotebookMigrationEnabled: true }, + } as unknown as GuiConfigService; + + stubUtil = { + getNewOperatorPredicate: vi.fn((operatorType: string, customDisplayName?: string) => ({ + operatorID: `${operatorType}-${opIdCounter++}`, + operatorType, + operatorVersion: "test-version", + operatorProperties: { workers: 1, defaultEnv: true, envName: "" }, + inputPorts: [{ portID: "input-0", disallowMultiInputs: false }], + outputPorts: [{ portID: "output-0" }], + showAdvanced: false, + isDisabled: false, + customDisplayName, + dynamicInputPorts: true, + dynamicOutputPorts: true, + })), + } as unknown as WorkflowUtilService; + + const llm = new NotebookMigrationLLM(stubConfig, stubUtil); + llm.initialize(); + return llm; + } + + function codeCell(uuid: string | undefined, source: string) { + return { cell_type: "code", metadata: uuid === undefined ? {} : { uuid }, source }; + } + + // Queue the two responses convertNotebookToWorkflow consumes, in order. 
+ function mockResponses(workflowResponse: string, mappingResponse: string) { + mockGenerateText.mockResolvedValueOnce({ text: workflowResponse }).mockResolvedValueOnce({ text: mappingResponse }); + } + + beforeEach(() => { + opIdCounter = 0; + mockGenerateText.mockReset(); + }); + + describe("convertNotebookToWorkflow", () => { + it("builds operators, links, positions, and a bidirectional mapping", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "print(1)"), codeCell("CELL2", "print(2)")], + }; + mockResponses( + JSON.stringify({ + code: { UDF1: "code1", UDF2: "code2" }, + edges: [["UDF1", "UDF2"]], + outputs: { UDF1: ["a", "b"], UDF2: ["c"] }, + }), + JSON.stringify({ UDF1: ["CELL1"], UDF2: ["CELL2"] }) + ); + + const { workflowJSON, workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators.map((op: any) => op.operatorID)).toEqual(["PythonUDFV2-0", "PythonUDFV2-1"]); + expect(workflowJSON.operators[0].operatorProperties).toMatchObject({ + code: "code1", + retainInputColumns: false, + }); + expect(workflowJSON.operatorPositions).toEqual({ + "PythonUDFV2-0": { x: 140, y: 0 }, + "PythonUDFV2-1": { x: 280, y: 0 }, + }); + expect(workflowJSON.links).toHaveLength(1); + expect(workflowJSON.links[0].source).toEqual({ operatorID: "PythonUDFV2-0", portID: "output-0" }); + expect(workflowJSON.links[0].target).toEqual({ operatorID: "PythonUDFV2-1", portID: "input-0" }); + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1"], + "PythonUDFV2-1": ["CELL2"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0"], + CELL2: ["PythonUDFV2-1"], + }); + }); + + // NOTE (C2): the attributeType is currently hardcoded to "binary". If C2 lands as + // "LLM returns per-column types", update the expected attributeType values here. 
+ it("declares output columns with attributeType binary", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "x = 1")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "code1" }, edges: [], outputs: { UDF1: ["a", "b"] } }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators[0].operatorProperties.outputColumns).toEqual([ + { attributeName: "a", attributeType: "binary" }, + { attributeName: "b", attributeType: "binary" }, + ]); + }); + + it("maps multiple cells onto the same UDF, and one cell onto multiple UDFs", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "a"), codeCell("CELL2", "b")], + }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1", UDF2: "c2" }, edges: [], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1", "CELL2"], UDF2: ["CELL1"] }) + ); + + const { workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1", "CELL2"], + "PythonUDFV2-1": ["CELL1"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0", "PythonUDFV2-1"], + CELL2: ["PythonUDFV2-0"], + }); + }); + + it("produces a link with an undefined endpoint when an edge references an unknown UDF id", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "a")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1" }, edges: [["UDF1", "UDFX"]], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + // Documents current behavior: udfMappingToUUID["UDFX"] is undefined. 
+ expect(workflowJSON.links[0].source.operatorID).toBe("PythonUDFV2-0"); + expect(workflowJSON.links[0].target.operatorID).toBeUndefined(); + }); Review Comment: If `convertNotebookToWorkflow` is updated to validate edges and throw on unknown UDF ids, this test should assert the error rather than documenting a workflow link with `operatorID: undefined` (which is not a usable workflow). ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { NotebookMigrationLLM, Notebook } from "./migration-llm"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { generateText } from "ai"; +import type { Mock } from "vitest"; + +// The LLM transport and OpenAI client are mocked so the tests exercise only the +// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping). 
+vi.mock("ai", () => ({ generateText: vi.fn() })); +vi.mock("@ai-sdk/openai", () => ({ + createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })), +})); + +const mockGenerateText = generateText as unknown as Mock; + +describe("NotebookMigrationLLM", () => { + let opIdCounter = 0; + let stubUtil: WorkflowUtilService; + + // Build a fresh, initialized session with stubbed dependencies. The stubbed + // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...). + function makeLLM(): NotebookMigrationLLM { + const stubConfig = { + env: { pythonNotebookMigrationEnabled: true }, + } as unknown as GuiConfigService; + + stubUtil = { + getNewOperatorPredicate: vi.fn((operatorType: string, customDisplayName?: string) => ({ + operatorID: `${operatorType}-${opIdCounter++}`, + operatorType, + operatorVersion: "test-version", + operatorProperties: { workers: 1, defaultEnv: true, envName: "" }, + inputPorts: [{ portID: "input-0", disallowMultiInputs: false }], + outputPorts: [{ portID: "output-0" }], + showAdvanced: false, + isDisabled: false, + customDisplayName, + dynamicInputPorts: true, + dynamicOutputPorts: true, + })), + } as unknown as WorkflowUtilService; + + const llm = new NotebookMigrationLLM(stubConfig, stubUtil); + llm.initialize(); + return llm; + } + + function codeCell(uuid: string | undefined, source: string) { + return { cell_type: "code", metadata: uuid === undefined ? {} : { uuid }, source }; + } + + // Queue the two responses convertNotebookToWorkflow consumes, in order. 
+ function mockResponses(workflowResponse: string, mappingResponse: string) { + mockGenerateText.mockResolvedValueOnce({ text: workflowResponse }).mockResolvedValueOnce({ text: mappingResponse }); + } + + beforeEach(() => { + opIdCounter = 0; + mockGenerateText.mockReset(); + }); + + describe("convertNotebookToWorkflow", () => { + it("builds operators, links, positions, and a bidirectional mapping", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "print(1)"), codeCell("CELL2", "print(2)")], + }; + mockResponses( + JSON.stringify({ + code: { UDF1: "code1", UDF2: "code2" }, + edges: [["UDF1", "UDF2"]], + outputs: { UDF1: ["a", "b"], UDF2: ["c"] }, + }), + JSON.stringify({ UDF1: ["CELL1"], UDF2: ["CELL2"] }) + ); + + const { workflowJSON, workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators.map((op: any) => op.operatorID)).toEqual(["PythonUDFV2-0", "PythonUDFV2-1"]); + expect(workflowJSON.operators[0].operatorProperties).toMatchObject({ + code: "code1", + retainInputColumns: false, + }); + expect(workflowJSON.operatorPositions).toEqual({ + "PythonUDFV2-0": { x: 140, y: 0 }, + "PythonUDFV2-1": { x: 280, y: 0 }, + }); + expect(workflowJSON.links).toHaveLength(1); + expect(workflowJSON.links[0].source).toEqual({ operatorID: "PythonUDFV2-0", portID: "output-0" }); + expect(workflowJSON.links[0].target).toEqual({ operatorID: "PythonUDFV2-1", portID: "input-0" }); + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1"], + "PythonUDFV2-1": ["CELL2"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0"], + CELL2: ["PythonUDFV2-1"], + }); + }); + + // NOTE (C2): the attributeType is currently hardcoded to "binary". If C2 lands as + // "LLM returns per-column types", update the expected attributeType values here. 
+ it("declares output columns with attributeType binary", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "x = 1")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "code1" }, edges: [], outputs: { UDF1: ["a", "b"] } }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators[0].operatorProperties.outputColumns).toEqual([ + { attributeName: "a", attributeType: "binary" }, + { attributeName: "b", attributeType: "binary" }, + ]); + }); + + it("maps multiple cells onto the same UDF, and one cell onto multiple UDFs", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "a"), codeCell("CELL2", "b")], + }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1", UDF2: "c2" }, edges: [], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1", "CELL2"], UDF2: ["CELL1"] }) + ); + + const { workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1", "CELL2"], + "PythonUDFV2-1": ["CELL1"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0", "PythonUDFV2-1"], + CELL2: ["PythonUDFV2-0"], + }); + }); + + it("produces a link with an undefined endpoint when an edge references an unknown UDF id", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "a")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1" }, edges: [["UDF1", "UDFX"]], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + // Documents current behavior: udfMappingToUUID["UDFX"] is undefined. 
+ expect(workflowJSON.links[0].source.operatorID).toBe("PythonUDFV2-0"); + expect(workflowJSON.links[0].target.operatorID).toBeUndefined(); + }); + + it("handles empty code, edges, and outputs", async () => { + const notebook: Notebook = { cells: [] }; + mockResponses(JSON.stringify({ code: {}, edges: [], outputs: {} }), JSON.stringify({})); + + const { workflowJSON, workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators).toEqual([]); + expect(workflowJSON.links).toEqual([]); + expect(workflowNotebookMapping.operator_to_cell).toEqual({}); + expect(workflowNotebookMapping.cell_to_operator).toEqual({}); + }); + + it("emits the 'undefined' cell marker in the prompt when a code cell lacks metadata.uuid", async () => { + const notebook: Notebook = { cells: [codeCell(undefined, "print(1)")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1" }, edges: [], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + await makeLLM().convertNotebookToWorkflow(notebook); + + // The notebook string (embedded in the workflow prompt) is sent to generateText. + // messages is a shared, mutated array, so search every message content rather than + // assuming a fixed index. + const allPromptContent = mockGenerateText.mock.calls + .flatMap(call => call[0].messages.map((m: any) => m.content)) + .join("\n"); + expect(allPromptContent).toContain("# START undefined"); + }); + }); + + describe("parseJsonResponse", () => { + // parseJsonResponse is private; cast to access it directly for focused coverage. 
+ const parse = (raw: string) => (makeLLM() as any).parseJsonResponse(raw, "workflow"); + + it("parses bare JSON", () => { + expect(parse('{"a":1}')).toEqual({ a: 1 }); + }); + + it("strips a ```json fence", () => { + expect(parse('```json\n{"a":1}\n```')).toEqual({ a: 1 }); + }); + + it("strips a plain ``` fence", () => { + expect(parse('```\n{"a":1}\n```')).toEqual({ a: 1 }); + }); + + it("tolerates surrounding whitespace and newlines around the fence", () => { + expect(parse('\n\n ```json\n{"a":1}\n``` \n\n')).toEqual({ a: 1 }); + }); + + it("throws a contextual error on malformed JSON", () => { + expect(() => parse("not json")).toThrow("Failed to parse LLM workflow response as JSON"); + }); + + it("does not strip a fence preceded by prose (documents current limitation)", () => { + expect(() => parse('Here is the JSON: ```json\n{"a":1}\n```\nThanks!')).toThrow( + "Failed to parse LLM workflow response as JSON" + ); + }); Review Comment: If `parseJsonResponse` is made more forgiving (extracting fenced JSON even when surrounded by prose), update this test to assert successful parsing rather than expecting an error. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} Review Comment: `Cell.source` in `.ipynb` files is commonly an array of strings (lines). Treating it as a single string can produce prompts with comma-separated text and change semantics. Consider supporting both `string` and `string[]`. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + 
OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. 
+ apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. + */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. 
+ */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); + + const workflow = await this.sendPrompt(`${WORKFLOW_PROMPT}\n${notebookString}`); + const mapping = await this.sendPrompt(MAPPING_PROMPT); + + // Remove ```json blocks and parse + const udfLLMResponse = this.parseJsonResponse(workflow, "workflow"); + + const workflowJSON: WorkflowJSON = { + operators: [], + operatorPositions: {}, + links: [], + commentBoxes: [], + settings: { + dataTransferBatchSize: 400, + }, + }; Review Comment: Workflow creation hardcodes `dataTransferBatchSize: 400` and does not set `executionMode`. The rest of the frontend uses GUI config defaults for both (see `defaultDataTransferBatchSize` / `defaultExecutionMode`), so the migrated workflow should use those defaults as well. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return 
this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. 
+ */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. + */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); + + const workflow = await this.sendPrompt(`${WORKFLOW_PROMPT}\n${notebookString}`); + const mapping = await this.sendPrompt(MAPPING_PROMPT); + + // Remove ```json blocks and parse + const udfLLMResponse = this.parseJsonResponse(workflow, "workflow"); + + const workflowJSON: WorkflowJSON = { + operators: [], + operatorPositions: {}, + links: [], + commentBoxes: [], + settings: { + dataTransferBatchSize: 400, + }, + }; + + const udfMappingToUUID: Record<string, string> = {}; + + Object.entries(udfLLMResponse.code).forEach(([udfId, udfCode], i) => { + let udfOutputColumns: { attributeName: string; attributeType: string }[] = []; + if (udfLLMResponse.outputs && udfLLMResponse.outputs[udfId]) { + udfOutputColumns = udfLLMResponse.outputs[udfId].map((attr: string) => ({ + attributeName: attr, + attributeType: "binary", + })); + } + + // Build the operator from the live PythonUDFV2 schema so the operatorVersion, ports, and + // property defaults track the backend definition, then overlay the generated code/outputs. 
+ const base = this.workflowUtilService.getNewOperatorPredicate("PythonUDFV2", udfId); + const operator: OperatorPredicate = { + ...base, + operatorProperties: { + ...base.operatorProperties, + code: udfCode, + retainInputColumns: false, + outputColumns: udfOutputColumns, + }, + }; + + udfMappingToUUID[udfId] = operator.operatorID; + workflowJSON.operators.push(operator); + workflowJSON.operatorPositions[operator.operatorID] = { x: 140 * (i + 1), y: 0 }; + }); + + // Add links/edges + (udfLLMResponse.edges || []).forEach(([source, target]: [string, string]) => { + workflowJSON.links.push({ + linkID: `link-${uuidv4()}`, + source: { + operatorID: udfMappingToUUID[source], + portID: "output-0", + }, + target: { + operatorID: udfMappingToUUID[target], + portID: "input-0", + }, + }); + }); + + // Parse mapping + const parsedMapping: Record<string, string[]> = this.parseJsonResponse(mapping, "mapping"); + + const udfToCell: Record<string, string[]> = {}; + const cellToUdf: Record<string, string[]> = {}; + + Object.entries(parsedMapping).forEach(([udf, cells]) => { + const udfUUID = udfMappingToUUID[udf]; + udfToCell[udfUUID] = cells; + cells.forEach(cell => { + if (!cellToUdf[cell]) { + cellToUdf[cell] = [udfUUID]; + } else { + cellToUdf[cell].push(udfUUID); + } + }); + }); Review Comment: The mapping step assumes every key in the LLM response exists in `udfMappingToUUID`. If the LLM returns an extra/typo UDF id, this will create an `undefined` key in `operator_to_cell` and insert `undefined` into `cell_to_operator`. Validate the mapping keys before using them. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: OperatorPredicate[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + 
private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor( + private config: GuiConfigService, + private workflowUtilService: WorkflowUtilService + ) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. 
+ apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. + */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. 
+ */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); + + const workflow = await this.sendPrompt(`${WORKFLOW_PROMPT}\n${notebookString}`); + const mapping = await this.sendPrompt(MAPPING_PROMPT); + + // Remove ```json blocks and parse + const udfLLMResponse = this.parseJsonResponse(workflow, "workflow"); + + const workflowJSON: WorkflowJSON = { + operators: [], + operatorPositions: {}, + links: [], + commentBoxes: [], + settings: { + dataTransferBatchSize: 400, + }, + }; + + const udfMappingToUUID: Record<string, string> = {}; + + Object.entries(udfLLMResponse.code).forEach(([udfId, udfCode], i) => { + let udfOutputColumns: { attributeName: string; attributeType: string }[] = []; + if (udfLLMResponse.outputs && udfLLMResponse.outputs[udfId]) { + udfOutputColumns = udfLLMResponse.outputs[udfId].map((attr: string) => ({ + attributeName: attr, + attributeType: "binary", + })); + } + + // Build the operator from the live PythonUDFV2 schema so the operatorVersion, ports, and + // property defaults track the backend definition, then overlay the generated code/outputs. 
+ const base = this.workflowUtilService.getNewOperatorPredicate("PythonUDFV2", udfId); + const operator: OperatorPredicate = { + ...base, + operatorProperties: { + ...base.operatorProperties, + code: udfCode, + retainInputColumns: false, + outputColumns: udfOutputColumns, + }, + }; + + udfMappingToUUID[udfId] = operator.operatorID; + workflowJSON.operators.push(operator); + workflowJSON.operatorPositions[operator.operatorID] = { x: 140 * (i + 1), y: 0 }; + }); + + // Add links/edges + (udfLLMResponse.edges || []).forEach(([source, target]: [string, string]) => { + workflowJSON.links.push({ + linkID: `link-${uuidv4()}`, + source: { + operatorID: udfMappingToUUID[source], + portID: "output-0", + }, + target: { + operatorID: udfMappingToUUID[target], + portID: "input-0", + }, + }); + }); Review Comment: Edges from the LLM are currently accepted even when they reference unknown UDF ids, producing links with `operatorID: undefined` (and potentially breaking the workflow graph). It's safer to validate and fail fast (or skip with telemetry) when the LLM response is inconsistent. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { NotebookMigrationLLM, Notebook } from "./migration-llm"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { generateText } from "ai"; +import type { Mock } from "vitest"; + +// The LLM transport and OpenAI client are mocked so the tests exercise only the +// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping). +vi.mock("ai", () => ({ generateText: vi.fn() })); +vi.mock("@ai-sdk/openai", () => ({ + createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })), +})); + +const mockGenerateText = generateText as unknown as Mock; + +describe("NotebookMigrationLLM", () => { + let opIdCounter = 0; + let stubUtil: WorkflowUtilService; + + // Build a fresh, initialized session with stubbed dependencies. The stubbed + // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...). 
+ function makeLLM(): NotebookMigrationLLM { + const stubConfig = { + env: { pythonNotebookMigrationEnabled: true }, + } as unknown as GuiConfigService; + + stubUtil = { + getNewOperatorPredicate: vi.fn((operatorType: string, customDisplayName?: string) => ({ + operatorID: `${operatorType}-${opIdCounter++}`, + operatorType, + operatorVersion: "test-version", + operatorProperties: { workers: 1, defaultEnv: true, envName: "" }, + inputPorts: [{ portID: "input-0", disallowMultiInputs: false }], + outputPorts: [{ portID: "output-0" }], + showAdvanced: false, + isDisabled: false, + customDisplayName, + dynamicInputPorts: true, + dynamicOutputPorts: true, + })), + } as unknown as WorkflowUtilService; + + const llm = new NotebookMigrationLLM(stubConfig, stubUtil); + llm.initialize(); + return llm; Review Comment: After switching `initialize()` to use an access token (JWT) by default, these tests should pass an explicit dummy token to avoid depending on localStorage/AuthService state. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { NotebookMigrationLLM, Notebook } from "./migration-llm"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service"; +import { generateText } from "ai"; +import type { Mock } from "vitest"; + +// The LLM transport and OpenAI client are mocked so the tests exercise only the +// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping). +vi.mock("ai", () => ({ generateText: vi.fn() })); +vi.mock("@ai-sdk/openai", () => ({ + createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })), +})); + +const mockGenerateText = generateText as unknown as Mock; + +describe("NotebookMigrationLLM", () => { + let opIdCounter = 0; + let stubUtil: WorkflowUtilService; + + // Build a fresh, initialized session with stubbed dependencies. The stubbed + // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...). + function makeLLM(): NotebookMigrationLLM { + const stubConfig = { + env: { pythonNotebookMigrationEnabled: true }, + } as unknown as GuiConfigService; + + stubUtil = { + getNewOperatorPredicate: vi.fn((operatorType: string, customDisplayName?: string) => ({ + operatorID: `${operatorType}-${opIdCounter++}`, + operatorType, + operatorVersion: "test-version", + operatorProperties: { workers: 1, defaultEnv: true, envName: "" }, + inputPorts: [{ portID: "input-0", disallowMultiInputs: false }], + outputPorts: [{ portID: "output-0" }], + showAdvanced: false, + isDisabled: false, + customDisplayName, + dynamicInputPorts: true, + dynamicOutputPorts: true, + })), + } as unknown as WorkflowUtilService; + + const llm = new NotebookMigrationLLM(stubConfig, stubUtil); + llm.initialize(); + return llm; + } + + function codeCell(uuid: string | undefined, source: string) { + return { cell_type: "code", metadata: uuid === undefined ? 
{} : { uuid }, source }; + } + + // Queue the two responses convertNotebookToWorkflow consumes, in order. + function mockResponses(workflowResponse: string, mappingResponse: string) { + mockGenerateText.mockResolvedValueOnce({ text: workflowResponse }).mockResolvedValueOnce({ text: mappingResponse }); + } + + beforeEach(() => { + opIdCounter = 0; + mockGenerateText.mockReset(); + }); + + describe("convertNotebookToWorkflow", () => { + it("builds operators, links, positions, and a bidirectional mapping", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "print(1)"), codeCell("CELL2", "print(2)")], + }; + mockResponses( + JSON.stringify({ + code: { UDF1: "code1", UDF2: "code2" }, + edges: [["UDF1", "UDF2"]], + outputs: { UDF1: ["a", "b"], UDF2: ["c"] }, + }), + JSON.stringify({ UDF1: ["CELL1"], UDF2: ["CELL2"] }) + ); + + const { workflowJSON, workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators.map((op: any) => op.operatorID)).toEqual(["PythonUDFV2-0", "PythonUDFV2-1"]); + expect(workflowJSON.operators[0].operatorProperties).toMatchObject({ + code: "code1", + retainInputColumns: false, + }); + expect(workflowJSON.operatorPositions).toEqual({ + "PythonUDFV2-0": { x: 140, y: 0 }, + "PythonUDFV2-1": { x: 280, y: 0 }, + }); + expect(workflowJSON.links).toHaveLength(1); + expect(workflowJSON.links[0].source).toEqual({ operatorID: "PythonUDFV2-0", portID: "output-0" }); + expect(workflowJSON.links[0].target).toEqual({ operatorID: "PythonUDFV2-1", portID: "input-0" }); + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1"], + "PythonUDFV2-1": ["CELL2"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0"], + CELL2: ["PythonUDFV2-1"], + }); + }); + + // NOTE (C2): the attributeType is currently hardcoded to "binary". 
If C2 lands as + // "LLM returns per-column types", update the expected attributeType values here. + it("declares output columns with attributeType binary", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "x = 1")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "code1" }, edges: [], outputs: { UDF1: ["a", "b"] } }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators[0].operatorProperties.outputColumns).toEqual([ + { attributeName: "a", attributeType: "binary" }, + { attributeName: "b", attributeType: "binary" }, + ]); + }); + + it("maps multiple cells onto the same UDF, and one cell onto multiple UDFs", async () => { + const notebook: Notebook = { + cells: [codeCell("CELL1", "a"), codeCell("CELL2", "b")], + }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1", UDF2: "c2" }, edges: [], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1", "CELL2"], UDF2: ["CELL1"] }) + ); + + const { workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowNotebookMapping.operator_to_cell).toEqual({ + "PythonUDFV2-0": ["CELL1", "CELL2"], + "PythonUDFV2-1": ["CELL1"], + }); + expect(workflowNotebookMapping.cell_to_operator).toEqual({ + CELL1: ["PythonUDFV2-0", "PythonUDFV2-1"], + CELL2: ["PythonUDFV2-0"], + }); + }); + + it("produces a link with an undefined endpoint when an edge references an unknown UDF id", async () => { + const notebook: Notebook = { cells: [codeCell("CELL1", "a")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1" }, edges: [["UDF1", "UDFX"]], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + const { workflowJSON } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + // Documents current behavior: udfMappingToUUID["UDFX"] is undefined. 
+ expect(workflowJSON.links[0].source.operatorID).toBe("PythonUDFV2-0"); + expect(workflowJSON.links[0].target.operatorID).toBeUndefined(); + }); + + it("handles empty code, edges, and outputs", async () => { + const notebook: Notebook = { cells: [] }; + mockResponses(JSON.stringify({ code: {}, edges: [], outputs: {} }), JSON.stringify({})); + + const { workflowJSON, workflowNotebookMapping } = JSON.parse(await makeLLM().convertNotebookToWorkflow(notebook)); + + expect(workflowJSON.operators).toEqual([]); + expect(workflowJSON.links).toEqual([]); + expect(workflowNotebookMapping.operator_to_cell).toEqual({}); + expect(workflowNotebookMapping.cell_to_operator).toEqual({}); + }); + + it("emits the 'undefined' cell marker in the prompt when a code cell lacks metadata.uuid", async () => { + const notebook: Notebook = { cells: [codeCell(undefined, "print(1)")] }; + mockResponses( + JSON.stringify({ code: { UDF1: "c1" }, edges: [], outputs: {} }), + JSON.stringify({ UDF1: ["CELL1"] }) + ); + + await makeLLM().convertNotebookToWorkflow(notebook); + + // The notebook string (embedded in the workflow prompt) is sent to generateText. + // messages is a shared, mutated array, so search every message content rather than + // assuming a fixed index. + const allPromptContent = mockGenerateText.mock.calls + .flatMap(call => call[0].messages.map((m: any) => m.content)) + .join("\n"); + expect(allPromptContent).toContain("# START undefined"); + }); Review Comment: If `convertNotebookToWorkflow` is updated to require `metadata.uuid` (to avoid ambiguous `# START undefined` markers), this test should assert that the method rejects with a clear error instead of checking for the literal string `undefined` in the prompt. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: users@infra.apache.org
