Re: [PR] feat: add the Python Notebook Migration Tool [texera]

via GitHub Mon, 27 Apr 2026 13:05:36 -0700


Copilot commented on code in PR #4486:
URL: https://github.com/apache/texera/pull/4486#discussion_r3149874888



##########
frontend/src/app/workspace/service/jupyter-panel/jupyter-panel.service.ts:
##########
@@ -0,0 +1,263 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { Injectable } from "@angular/core";
+import { BehaviorSubject, catchError, map, of } from "rxjs";
+import { WorkflowActionService } from 
"../workflow-graph/model/workflow-action.service";
+import { OperatorLink } from "../../types/workflow-common.interface";
+import { HttpClient, HttpHeaders } from "@angular/common/http";
+import { UntilDestroy } from "@ngneat/until-destroy";
+import { NotificationService } from 
"src/app/common/service/notification/notification.service";
+import { distinctUntilChanged, switchMap } from "rxjs/operators";
+import { AppSettings } from "../../../common/app-setting";
+import { NotebookMigrationService } from 
"../notebook-migration/notebook-migration.service";
+
+@UntilDestroy()
+@Injectable({
+  providedIn: "root",
+})
+export class JupyterPanelService {
+  private jupyterNotebookPanelVisible = new BehaviorSubject<boolean>(false);
+  public jupyterNotebookPanelVisible$ = 
this.jupyterNotebookPanelVisible.asObservable();
+
+  private iframeRef: HTMLIFrameElement | null = null; // Store reference to 
iframe element
+  private cellContent: string[] = []; // Store the content of the cells
+  private highlightedCell: number | null = null; // Track the highlighted cell
+
+  // Precomputed dictionary for cell to highlight mapping
+  private cellToHighlightMapping: Record<string, { components: string[]; 
edges: string[] }> = {};
+
+  constructor(
+    private workflowActionService: WorkflowActionService,
+    private http: HttpClient,
+    private notificationService: NotificationService,
+    private notebookMigrationService: NotebookMigrationService
+  ) {
+    window.addEventListener("message", this.handleNotebookMessage);
+  }

Review Comment:
   The service registers a global `window.addEventListener("message", ...)` 
handler but never removes it. Even though this is a singleton, the listener can 
leak, or be registered more than once, in tests or if the provider scope 
changes. Implement `OnDestroy` and call `removeEventListener` with the same 
handler reference; `handleNotebookMessage` is an arrow-function property, so 
the same reference can be passed to both calls.



##########
notebook-migration-service/build.sbt:
##########
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import scala.collection.Seq
+
+name := "notebook-migration-service"
+
+
+enablePlugins(JavaAppPackaging)
+
+// Enable semanticdb for Scalafix
+ThisBuild / semanticdbEnabled := true
+ThisBuild / semanticdbVersion := scalafixSemanticdb.revision
+
+// Manage dependency conflicts by always using the latest revision
+ThisBuild / conflictManager := ConflictManager.latestRevision
+
+// Restrict parallel execution of tests to avoid conflicts
+Global / concurrentRestrictions += Tags.limit(Tags.Test, 1)
+
+/////////////////////////////////////////////////////////////////////////////
+// Compiler Options
+/////////////////////////////////////////////////////////////////////////////
+
+// Scala compiler options
+Compile / scalacOptions ++= Seq(
+  "-Xelide-below", "WARNING",       // Turn on optimizations with "WARNING" as 
the threshold
+  "-feature",                       // Check feature warnings
+  "-deprecation",                   // Check deprecation warnings
+  "-Ywarn-unused:imports"           // Check for unused imports
+)
+
+/////////////////////////////////////////////////////////////////////////////
+// Version Variables
+/////////////////////////////////////////////////////////////////////////////
+
+val dropwizardVersion = "4.0.7"
+val mockitoVersion = "5.4.0"
+val assertjVersion = "3.24.2"
+
+/////////////////////////////////////////////////////////////////////////////
+// Test-related Dependencies
+/////////////////////////////////////////////////////////////////////////////
+
+libraryDependencies ++= Seq(
+  "org.scalamock" %% "scalamock" % "5.2.0" % Test,                   // 
ScalaMock
+  "org.scalatest" %% "scalatest" % "3.2.17" % Test,                  // 
ScalaTest
+  "io.dropwizard" % "dropwizard-testing" % dropwizardVersion % Test, // 
Dropwizard Testing
+  "org.mockito" % "mockito-core" % mockitoVersion % Test,            // 
Mockito for mocking
+  "org.assertj" % "assertj-core" % assertjVersion % Test,            // 
AssertJ for assertions
+  "com.novocode" % "junit-interface" % "0.11" % Test                // SBT 
interface for JUnit
+)
+
+/////////////////////////////////////////////////////////////////////////////
+// Dependencies
+/////////////////////////////////////////////////////////////////////////////
+
+// Core Dependencies
+libraryDependencies ++= Seq(
+  "io.dropwizard" % "dropwizard-core" % dropwizardVersion,
+  "io.dropwizard" % "dropwizard-auth" % dropwizardVersion, // Dropwizard 
Authentication module
+  "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.15.2"

Review Comment:
   `notebook-migration-service` declares `jackson-module-scala` as `2.15.2`, 
but the rest of the repo standardizes on newer Jackson (currently `2.18.6`) and 
the root build already adds overrides. Please align this module’s Jackson 
dependency with the repo-wide version to avoid runtime classpath conflicts.
   ```suggestion
     "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.18.6"
   ```



##########
frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts:
##########
@@ -618,12 +620,14 @@ export class WorkflowEditorComponent implements OnInit, 
AfterViewInit, OnDestroy
         const elementID = event[0].model.id.toString();
         const highlightedOperatorIDs = 
this.wrapper.getCurrentHighlightedOperatorIDs();
         const highlightedCommentBoxIDs = 
this.wrapper.getCurrentHighlightedCommentBoxIDs();
+        this.jupyterPanelService.onWorkflowComponentClick(elementID); // 
Highlight corresponding Jupyter notebook cell
         if (event[1].shiftKey) {
           // if in multiselect toggle highlights on click
           if (highlightedOperatorIDs.includes(elementID)) {
             this.workflowActionService.unhighlightOperators(elementID);
           } else if 
(this.workflowActionService.getTexeraGraph().hasOperator(elementID)) {
             
this.workflowActionService.highlightOperators(<boolean>event[1].shiftKey, 
elementID);
+            this.jupyterPanelService.onWorkflowComponentClick(elementID); // 
Highlight corresponding Jupyter notebook cell

Review Comment:
   `onWorkflowComponentClick` is invoked twice for the same click path (once 
unconditionally above, and again inside the `shiftKey` branch). This can cause 
duplicate postMessage/highlight work; please keep only one call per click.
   ```suggestion
   
   ```



##########
notebook-migration-service/src/main/resources/docker-compose.yml:
##########
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: texera-jupyter
+services:
+
+  jupyter:
+    build:
+      context: .
+      dockerfile: Dockerfile

Review Comment:
   `docker-compose.yml` specifies `dockerfile: Dockerfile`, but the repository 
file is named `dockerfile` (lowercase). On case-sensitive filesystems this will 
fail to build; rename the file to `Dockerfile` or update the compose reference 
to match the actual filename.
   ```suggestion
         dockerfile: dockerfile
   ```



##########
frontend/src/app/workspace/service/jupyter-panel/jupyter-panel.service.ts:
##########
@@ -0,0 +1,263 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { Injectable } from "@angular/core";
+import { BehaviorSubject, catchError, map, of } from "rxjs";
+import { WorkflowActionService } from 
"../workflow-graph/model/workflow-action.service";
+import { OperatorLink } from "../../types/workflow-common.interface";
+import { HttpClient, HttpHeaders } from "@angular/common/http";
+import { UntilDestroy } from "@ngneat/until-destroy";
+import { NotificationService } from 
"src/app/common/service/notification/notification.service";
+import { distinctUntilChanged, switchMap } from "rxjs/operators";
+import { AppSettings } from "../../../common/app-setting";
+import { NotebookMigrationService } from 
"../notebook-migration/notebook-migration.service";
+
+@UntilDestroy()
+@Injectable({
+  providedIn: "root",
+})
+export class JupyterPanelService {
+  private jupyterNotebookPanelVisible = new BehaviorSubject<boolean>(false);
+  public jupyterNotebookPanelVisible$ = 
this.jupyterNotebookPanelVisible.asObservable();
+
+  private iframeRef: HTMLIFrameElement | null = null; // Store reference to 
iframe element
+  private cellContent: string[] = []; // Store the content of the cells
+  private highlightedCell: number | null = null; // Track the highlighted cell
+
+  // Precomputed dictionary for cell to highlight mapping
+  private cellToHighlightMapping: Record<string, { components: string[]; 
edges: string[] }> = {};
+
+  constructor(
+    private workflowActionService: WorkflowActionService,
+    private http: HttpClient,
+    private notificationService: NotificationService,
+    private notebookMigrationService: NotebookMigrationService
+  ) {
+    window.addEventListener("message", this.handleNotebookMessage);
+  }
+
+  public init(): void {
+    this.workflowActionService
+      .workflowMetaDataChanged()
+      .pipe(
+        map(meta => meta.wid),
+        distinctUntilChanged()
+      )
+      .subscribe(wid => {
+        this.closeJupyterNotebookPanel();
+        if (wid != 0) {
+          this.fetchNotebookAndMapping(wid).subscribe(result => {
+            if (result == 1) {
+              this.precomputeHighlightMapping();
+              this.openJupyterNotebookPanel();
+            }
+          });
+        }
+      });
+  }
+
+  private fetchNotebookAndMapping(
+    workflowID: number | undefined = 
this.workflowActionService.getWorkflow().wid,
+    vId: number = 1
+  ) {
+    // Fetch mapping and notebook from migration database if exists for wid
+    const dbAPIUrl = 
`${AppSettings.getApiEndpoint()}/notebook-migration/fetch-notebook-and-mapping`;
+    const headers = new HttpHeaders({ "Content-Type": "application/json" });
+    const payload = {
+      wid: workflowID,
+      vid: vId, // Future work: add dynamic fetching of current workflow vId
+    };
+
+    return this.http.post(dbAPIUrl, payload, { headers }).pipe(
+      switchMap(async (response: any) => {
+        // Only load mapping and workflow if they exist
+        if (response.exists) {
+          this.notebookMigrationService.setMapping("mapping_wid_" + 
workflowID, response.mapping);
+
+          if ((await 
this.notebookMigrationService.sendNotebookToJupyter(response.notebook)) == 1) {
+            return 1;
+          } else {
+            return 0;
+          }
+        } else {
+          return 0;
+        }
+      }),
+      catchError((error: unknown) => {
+        console.error("Network response was not ok when fetching notebook and 
mapping:", error);
+        return of(0);
+      })
+    );
+  }
+
+  // Precompute the dictionary for O(1) highlighting
+  private precomputeHighlightMapping(): void {
+    const wid = this.workflowActionService.getWorkflow().wid;
+
+    if (wid === undefined) {
+      console.warn("Workflow ID is undefined. Cannot compute highlight 
mapping.");
+      return;
+    }
+    const mappingKey = "mapping_wid_" + wid;
+    const mapping = this.notebookMigrationService.getMapping(mappingKey);
+
+    if (mapping == undefined) {
+      console.warn(`Mapping key '${mappingKey}' not found. Cannot compute 
highlight mapping.`);
+      return;
+    }
+    const cellToOperator = mapping.cell_to_operator;
+
+    const allLinks: OperatorLink[] = 
this.workflowActionService.getTexeraGraph().getAllLinks();
+    if (allLinks.length === 0) {
+      console.warn("No links found in the graph during precompute.");
+      return;
+    }
+
+    for (const cellUUID in cellToOperator) {
+      const components = cellToOperator[cellUUID] || [];
+      const componentSet = new Set(components);
+      const edges: string[] = [];
+
+      allLinks.forEach(link => {
+        const sourceOperatorID = link.source.operatorID;
+        const targetOperatorID = link.target.operatorID;
+
+        if (
+          componentSet.has(sourceOperatorID) &&
+          componentSet.has(targetOperatorID) &&
+          sourceOperatorID !== targetOperatorID
+        ) {
+          edges.push(link.linkID);
+        }
+      });
+
+      this.cellToHighlightMapping[cellUUID] = { components, edges };
+    }
+  }
+
+  // Set the iframe reference (from the component's ViewChild)
+  setIframeRef(iframe: HTMLIFrameElement) {
+    this.iframeRef = iframe;
+  }
+
+  // Open the Jupyter Notebook panel
+  openPanel(panelName: string): void {
+    if (panelName === "JupyterNotebookPanel") {
+      this.jupyterNotebookPanelVisible.next(true);
+    }
+  }
+
+  // Close the Jupyter Notebook panel
+  closeJupyterNotebookPanel(): void {
+    this.jupyterNotebookPanelVisible.next(false);
+    const wid = this.workflowActionService.getWorkflow().wid;
+    if (wid != undefined) {
+      this.notebookMigrationService.deleteMapping("mapping_wid_" + wid);
+    }
+  }
+
+  // Minimize the Jupyter Notebook panel
+  public minimizeJupyterNotebookPanel(): void {
+    this.jupyterNotebookPanelVisible.next(false);
+  }
+
+  // Expand the Jupyter Notebook panel
+  public openJupyterNotebookPanel(): void {
+    const wid = this.workflowActionService.getWorkflow().wid;
+    const mappingKey = "mapping_wid_" + wid;
+    // Check if there is corresponding mapping data
+    if (wid === undefined || 
!this.notebookMigrationService.hasMapping(mappingKey)) {
+      this.notificationService.warning("No Jupyter notebook associated with 
this workflow.");
+      return;
+    }
+
+    // Expand only if the mapping exists
+    this.jupyterNotebookPanelVisible.next(true);
+  }
+
+  // Handle messages from the Jupyter notebook iframe
+  private handleNotebookMessage = async (event: MessageEvent) => {
+    const allowedOrigins = [window.location.origin, await 
this.notebookMigrationService.getJupyterURL()];
+    if (!allowedOrigins.includes(event.origin)) {
+      return;
+    }
+

Review Comment:
   `handleNotebookMessage` calls `await 
this.notebookMigrationService.getJupyterURL()` for every posted message, which 
triggers a network fetch each time a cell is clicked. Because the call happens 
before the origin check, it also runs for messages from unrelated origins. 
Cache the Jupyter URL (or compute it once when opening the panel) so message 
handling stays O(1) and doesn’t depend on the network.



##########
frontend/src/app/workspace/component/workspace.component.ts:
##########
@@ -280,11 +296,48 @@ export class WorkspaceComponent implements AfterViewInit, 
OnInit, OnDestroy {
       .pipe(untilDestroyed(this))
       .subscribe();
   }
+
   public triggerCenter(): void {
     this.workflowActionService.getTexeraGraph().triggerCenterEvent();
   }
 
   public get copilotEnabled(): boolean {
     return this.config.env.copilotEnabled;
   }
+
+  onWaitingForLLMChanged(isWaiting: boolean) {
+    this.isWaitingForLLM = isWaiting;
+
+    if (isWaiting) {
+      this.startTimer();
+    } else {
+      this.stopTimer();
+    }
+  }
+
+  startTimer() {
+    this.startTime = Date.now();
+    this.updateElapsedTime();
+    this.timerInterval = setInterval(() => {
+      this.updateElapsedTime();
+    }, 1000);
+  }
+
+  stopTimer() {
+    clearInterval(this.timerInterval);
+    this.timerInterval = null;
+    this.startTime = null;
+  }

Review Comment:
   The timer interval is only cleared when `onWaitingForLLMChanged(false)` is 
called. If the component is destroyed while waiting (route change/tab close), 
the interval can continue running; ensure `ngOnDestroy` also calls 
`stopTimer()` (or otherwise clears `timerInterval`).



##########
frontend/src/app/workspace/service/notebook-migration/migration-prompts.ts:
##########
@@ -0,0 +1,410 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// TEXERA DOCUMENTATION
+
+// https://github.com/Texera/texera/wiki/Guide-to-Use-a-Python-UDF
+export const TEXERA_OVERVIEW = `
+You are a robust compiler that takes python code and translates it to our 
personal workflow environment Texera that uses python.
+
+  Texera is a data analytics tool that uses workflows to do machine learning 
and data analytics computation. User's are able to drag and drop operators and 
connect their inputs and outputs in a workflow graphical user interface, which 
the code we are going to create.
+
+Texera is able to use Python user defined functions. Documentation of a Python 
UDF in Texera follows:
+  Process Data APIs
+
+There are three APIs to process the data in different units.
+
+  Tuple API.
+
+  class ProcessTupleOperator(UDFOperatorV2):
+
+def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+yield tuple_
+
+Tuple API takes one input tuple from a port at a time. It returns an iterator 
of optional TupleLike instances. A TupleLike is any data structure that 
supports key-value pairs, such as pytexera.Tuple, dict, defaultdict, 
NamedTuple, etc.
+
+  Tuple API is useful for implementing functional operations which are applied 
to tuples one by one, such as map, reduce, and filter.
+
+  Table API.
+
+  class ProcessTableOperator(UDFTableOperator):
+
+def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:
+yield table
+
+Table API consumes a Table at a time, which consists of all the tuples from a 
port. It returns an iterator of optional TableLike instances. A TableLike is a 
collection of TupleLike, and currently, we support pytexera.Table and 
pandas.DataFrame as a TableLike instance. More flexible types will be supported 
down the road.
+
+  Table API is useful for implementing blocking operations that will consume 
all the data from one port, such as join, sort, and machine learning training.
+
+  Batch API.
+
+  class ProcessBatchOperator(UDFBatchOperator):
+
+BATCH_SIZE = 10
+
+def process_batch(self, batch: Batch, port: int) -> 
Iterator[Optional[BatchLike]]:
+yield batch
+
+Batch API consumes a batch of tuples at a time. Similar to Table, a Batch is 
also a collection of Tuples; however, its size is defined by the BATCH_SIZE, 
and one port can have multiple batches. It returns an iterator of optional 
BatchLike instances. A BatchLike is a collection of TupleLike, and currently, 
we support pytexera.Batch and pandas.DataFrame as a BatchLike instance. More 
flexible types will be supported down the road.
+
+  The Batch API serves as a hybrid API combining the features of both the 
Tuple and Table APIs. It is particularly valuable for striking a balance 
between time and space considerations, offering a trade-off that optimizes 
efficiency.
+
+  All three APIs can return an empty iterator by yield None.
+
+  The template code for a Python UDF follows: MAKE SURE TO USE THE CLASS NAMES 
AND FUNCTIONS DEFINED, THIS IS A MUST FOR THE PROGRAM TO WORK. SELECT 1 OUT OF 
THE 3 PROCESSING OPERATOR FUNCTIONS TO BUILD DEPENDINGO ON THE CONTEXT OF CODE 
TRANSLATION.
+# Choose from the following templates:
+  #
+# from pytexera import *
+#
+# class ProcessTupleOperator(UDFOperatorV2):
+#
+#     @overrides
+#     def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+#         yield tuple_
+#
+# class ProcessBatchOperator(UDFBatchOperator):
+#     BATCH_SIZE = 10 # must be a positive integer
+#
+#     @overrides
+#     def process_batch(self, batch: Batch, port: int) -> 
Iterator[Optional[BatchLike]]:
+#         yield batch
+#
+# class ProcessTableOperator(UDFTableOperator):
+#
+#     @overrides
+#     def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:
+#         yield table
+`;
+
+// 
https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/tuple.py
+export const TUPLE_DOCUMENTATION = `
+### **<code>Tuple</code> Class Overview**
+
+The \`Tuple\` class is a **lazy-evaluated** data structure designed for 
efficient field storage and access. It provides:
+
+  1. **Support for Multiple Data Sources**:
+* Can be initialized from a \`TupleLike\` object, such as a \`pandas.Series\`, 
\`OrderedDict\`, or another \`Tuple\` instance.
+* Works with \`ArrowTableTupleProvider\` to access \`pyarrow.Table\` data.
+2. **Lazy Field Evaluation**:
+* Field values can be either **directly stored values** or **lazy accessors** 
(\`field_accessor\`).
+* If a field is accessed and is an accessor, it is evaluated and cached.
+3. **Schema (<code>Schema</code>) Enforcement**:
+  * A \`Tuple\` can be created without a schema but can be **finalized** with 
one using \`finalize(schema)\`, which:
+* **Casts field values** (e.g., \`NaN → None\`, \`Object → Bytes\`).
+* **Validates field completeness**, ensuring all fields match the \`Schema\`.
+4. **Pythonic Access Patterns**:
+* **Index-based access**: \`tuple["field_name"]\` or \`tuple[index]\` 
retrieves field values.
+* **Dictionary-like operations**: \`tuple.as_dict()\` returns an 
\`OrderedDict\`, and \`tuple.as_series()\` converts to a \`pandas.Series\`.
+* **Iterable support**: \`for field in tuple\` iterates over field values.
+5. **Hashing and Comparisons**:
+* Implements \`__hash__\` using a Java-like hashing algorithm, allowing usage 
as dictionary keys.
+* Implements \`__eq__\`, supporting equality checks based on field contents.
+6. **Partial Data Extraction**:
+* \`tuple.get_partial_tuple(attribute_names)\` returns a new \`Tuple\` 
instance containing only the specified fields.
+`;
+
+// 
https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/table.py
+export const TABLE_DOCUMENTATION = `### **<code>Table</code> Class Overview**
+
+The \`Table\` class extends \`pandas.DataFrame\`, providing **structured 
Tuple-based data management**. It is designed to integrate seamlessly with 
\`Tuple\` objects.
+
+#### **Key Features:**
+
+1. **Flexible Construction:**
+* Can be initialized from various sources:
+* Another \`Table\` (\`from_table(table)\`)
+* A \`pandas.DataFrame\` (\`from_data_frame(df)\`)
+* A list/iterator of \`TupleLike\` objects (\`from_tuple_likes(tuple_likes)\`)
+* Ensures all \`Tuple\` objects in a \`Table\` have **consistent field names**.
+2. **Tuple Conversion:**
+* \`as_tuples()\`: Converts the table rows into an **iterator of 
<code>Tuple</code> instances**, preserving the row order.
+3. **Equality Comparison (<code>__eq__</code>):**
+* Supports **row-wise equality checks** by comparing the underlying \`Tuple\` 
objects.
+4. **Universal Tuple Output (<code>all_output_to_tuple</code>):**
+* A helper function to convert **various data types** into \`Tuple\` 
iterators, supporting:
+* \`None\` → \`[None]\`
+* \`Table\` → \`as_tuples()\`
+* \`pandas.DataFrame\` → Converted into a \`Table\`, then to Tuples
+* \`List[TupleLike]\` → Converted to \`Tuple\` instances
+* A single \`TupleLike\` or \`Tuple\` → Wrapped in an iterator
+
+#### **Relation to <code>Tuple</code>:**
+
+* \`Table\` **stores multiple <code>Tuple</code> objects** and ensures schema 
consistency across rows.
+* Provides an **efficient bridge** between \`Tuple\`-based data and 
\`pandas.DataFrame\`, enabling compatibility with Python's data analysis tools.
+`;
+
+// 
https://github.com/Texera/texera/blob/42d803310c180978a9f02992f0e05556796b293c/core/amber/src/main/python/core/models/operator.py
+export const OPERATOR_DOCUMENTATION = `### **Operator Class Overview**
+
+The \`Operator\` class is an **abstract base class (ABC)** for all operators, 
defining the fundamental structure for processing \`Tuple\`, \`Batch\`, and 
\`Table\` data in a workflow.
+
+#### **Key Features & Hierarchy**
+
+1. **Base <code>Operator</code> Class**:
+* Defines lifecycle methods: \`open()\` and \`close()\`.
+* Supports a **source flag (<code>is_source</code>)** to distinguish source 
operators from others.
+2. **Tuple-Based Processing (<code>TupleOperatorV2</code>)**:
+* Processes individual \`Tuple\` objects through \`process_tuple(tuple_, 
port)\`.
+* Calls \`on_finish(port)\` when an input port is exhausted.
+3. **Types of Operators**:
+* **SourceOperator**:
+* Produces data via \`produce()\`, yielding \`TupleLike\` or \`TableLike\` 
objects.
+* Overrides \`on_finish(port)\` to output produced data.
+* **BatchOperator**:
+* Collects tuples into batches (\`BATCH_SIZE\`) before processing via 
\`process_batch(batch, port)\`.
+* Converts processed batches (typically \`pandas.DataFrame\`) into \`Tuple\` 
output.
+* **TableOperator**:
+* Collects tuples into a \`Table\` before processing via 
\`process_table(table, port)\`.
+* Converts processed \`Table\` output back into tuples.
+4. **Data Flow & Processing**:
+* Operators receive data **tuple-by-tuple**, **batch-by-batch**, or 
**table-by-table** depending on the type.
+* Results are **iterators** of transformed data (\`TupleLike\`, \`BatchLike\`, 
or \`TableLike\`).
+5. **Deprecated <code>TupleOperator</code>**:
+* The older version of \`TupleOperator\` is deprecated in favor of 
\`TupleOperatorV2\`.
+
+#### Relation to <code>Tuple</code> and <code>Table</code>
+
+* Operators **consume and transform** \`Tuple\` and \`Table\` data within a 
workflow.
+* **Tuple-based operators** process row-wise, while **Table operators** handle 
structured table transformations.
+* **Source operators** initiate the data flow by generating tuples or tables.`;
+
+export const UDF_INPUT_PORT_DOCUMENTATION = `
+Python UDF operators support multiple input and output ports, allowing a 
single operator to receive different types of data from various upstream 
operators. In the process_tuple(self, tuple_: Tuple, port: int) function in 
ProcessTupleOperator and the process_table(self, table: Table, port: int) 
function in ProcessTableOperator, the port parameter indicates the input port. 
The port numbers are assigned in order, starting from 0 to N, from top to 
bottom. When input data have different schemas, it is necessary to assign them 
to different input ports. However, if all input data share the same schema, 
additional ports are not required. In both ProcessTupleOperator and 
ProcessTableOperator, there is an on_finish(self, port: int) function that is 
executed only after all the tuples from the specified port are processed.
+
+Using this knowledge, for situations where multiple upstream UDFs act as input 
to a single UDF, we can introduce an intermediary UDF that collects all of the 
input data and reformats it into a single table, which is then passed as input 
to the original next downstream UDF. When it is necessary for this to occur in 
your translation from notebook to UDFs, include the intermediary UDF and make 
sure that it and the next operator that uses its output is formatted correctly 
and handles the data transfer properly.
+`;
+
+export const EXAMPLE_OF_GOOD_CONVERSION = `
+Here is an example of python code translated into a compatible Texera UDF that 
gives output that abides the output schema compatible with the Texera workflow 
operators for tuples. Other operators do not always follow this strict format, 
but the yielding output structure is important.
+
+Python Code (high level idea): We have a python code that given some data, we 
limit the number of data.
+
+Texera Operator code:
+from pytexera import *
+
+class ProcessTupleOperator(UDFOperatorV2):
+def __init__(self):
+self.limit = 10
+self.count = 0
+@overrides
+def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+if(self.count < self.limit):
+self.count += 1
+yield tuple_
+
+`;
+
+export const VISUALIZER_DOCUMENTATION = `
+Texera requires a unique way of generating visualizations from ML libraries:
+1. Ensures one yield per operator (per Texera’s UDF constraints).
+2. Uses Plotly for visualization and outputs results as embeddable HTML.
+3. Error handling is built-in to notify users when data is missing.
+`;
+
+export const EXAMPLE_OF_MULTIPLE_UDF_CONVERSION = `
+Here is an example of breaking up python code into multiple Texera UDFs. 
Format your response structure exactly like the given example. The "code" key 
contains a dictionary of the UDF ID's with their respective code. The "edges" 
key contains a list of pairs that contains the connections between UDFs. The 
"outputs" key contains a dictionary of the UDF ID's with a list of variable 
names that they yield in the UDF code. The UDFs can branch and merge, it does 
not have to be a linear chain depending on your implementation.
+
+Original Code:
+\`\`\`python
+# START CELL1
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+# END CELL1
+
+# START CELL2
+# Load the dataset
+file_path = 'diabetes.csv'
+data = pd.read_csv(file_path)
+# END CELL2
+
+# START CELL3
+# Remove duplicate rows
+data = data.drop_duplicates()
+
+# Remove rows with null values
+data = data.dropna()
+# END CELL3
+
+# START CELL4
+# Print the minimum, maximum, and mean for all fields
+print("Minimum values:\n", data.min())
+print("\nMaximum values:\n", data.max())
+print("\nMean values:\n", data.mean())
+# END CELL 4
+
+# START CELL5
+# Create a boxplot for the 'Pregnancies' field
+plt.figure(figsize=(8, 6))
+plt.boxplot(data['Pregnancies'], vert=False, patch_artist=True)
+plt.title('Boxplot of Pregnancies')
+plt.xlabel('Number of Pregnancies')
+plt.show()
+# END CELL5
+
+# START CELL6
+# Separate features and target variable
+X = data.drop('Outcome', axis=1)
+y = data['Outcome']
+# END CELL6
+
+# START CELL7
+# Split data into training and testing sets (80% train, 20% test)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42)
+
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test = scaler.transform(X_test)
+# END CELL7
+
+# START CELL8
+# Train Random Forest model
+rf_model = RandomForestClassifier(random_state=42)
+rf_model.fit(X_train, y_train)
+rf_pred = rf_model.predict(X_test)
+rf_accuracy = accuracy_score(y_test, rf_pred)
+print(f"Random Forest Accuracy: {rf_accuracy:.2%}")
+# END CELL8
+
+# START CELL9
+# Train SVM model
+svm_model = SVC(random_state=42)
+svm_model.fit(X_train, y_train)
+svm_pred = svm_model.predict(X_test)
+svm_accuracy = accuracy_score(y_test, svm_pred)
+print(f"SVM Accuracy: {svm_accuracy:.2%}")
+# END CELL9
+\`\`\`
+
+Texera UDF conversion:
+\`\`\`json
+{
+    "code": {
+        "UDF1": "# UDF1\nfrom pytexera import *\nimport pandas as pd\nfrom 
typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        # Remove duplicate rows\n        data = 
table.drop_duplicates()\n\n        # Remove rows with null values\n        data 
= data.dropna()\n\n        # Calculate statistics\n        min_values = 
data.min()\n        max_values = data.max()\n        mean_values = 
data.mean()\n\n        # Create a DataFrame to yield\n        result_table = 
pd.DataFrame({\n            'min_values': [min_values],\n            
'max_values': [max_values],\n            'mean_values': [mean_values],\n        
    'data': [data]\n        })\n\n        yield Table(result_table)",
+        "UDF2": "# UDF2\nfrom pytexera import *\nimport pandas as pd\nimport 
plotly.express as px\nimport plotly.io\nfrom typing import Iterator, 
Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n    def 
render_error(self, error_msg):\n        return '''<h1>Boxplot is not 
available.</h1>\n                  <p>Reason is: {} </p>\n               
'''.format(error_msg)\n\n    @overrides\n    def process_table(self, table: 
Table, port: int) -> Iterator[Optional[TableLike]]:\n        data = 
table['data'].iloc[0]\n\n        if data.empty:\n            yield 
{'html-content': self.render_error('input table is empty.')}\n            
return\n\n        # Create a boxplot for the 'Pregnancies' field\n        fig = 
px.box(data, x='Pregnancies')\n        fig.update_layout(margin=dict(l=0, r=0, 
t=0, b=0))\n\n        # Convert fig to HTML content\n        html = 
plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)\n        yield 
{'html-content': html}",
+        "UDF3": "# UDF3\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing 
import StandardScaler\nfrom typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        data = table['data'].iloc[0]\n\n        
# Separate features and target variable\n        X = data.drop('Outcome', 
axis=1)\n        y = data['Outcome']\n\n        # Split data into training and 
testing sets (80% train, 20% test)\n        X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=42)\n\n        scaler = 
StandardScaler()\n        X_train = scaler.fit_transform(X_train)\n        
X_test = scaler.transform(X_test)\n\n        # Create a DataFrame to yield\n    
    result_table = pd.DataFrame({\n            'X_train': [X_train], 'X_test': 
[X_test],\n            'y_train': [y_train], 'y_test': [y_test]\n        })\n\n        yield Table(result_table)",
+        "UDF4": "# UDF4\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import 
accuracy_score\nfrom typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        X_train = table['X_train'].iloc[0]\n    
    y_train = table['y_train'].iloc[0]\n        X_test = 
table['X_test'].iloc[0]\n        y_test = table['y_test'].iloc[0]\n\n        # 
Train Random Forest model\n        rf_model = 
RandomForestClassifier(random_state=42)\n        rf_model.fit(X_train, 
y_train)\n        rf_pred = rf_model.predict(X_test)\n        rf_accuracy = 
accuracy_score(y_test, rf_pred)\n\n        # Create a DataFrame to yield\n      
  result_table = pd.DataFrame({\n            'rf_model': [rf_model],\n          
  'rf_accuracy': [rf_accuracy],\n            'X_test': [X_test],\n            
'y_test': [y_test]\n 
        })\n\n        yield Table(result_table)",
+        "UDF5": "# UDF5\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score\nfrom typing 
import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n  
  @overrides\n    def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        X_train = table['X_train'].iloc[0]\n    
    y_train = table['y_train'].iloc[0]\n        X_test = 
table['X_test'].iloc[0]\n        y_test = table['y_test'].iloc[0]\n\n        # 
Train SVM model\n        svm_model = SVC(random_state=42)\n        
svm_model.fit(X_train, y_train)\n        svm_pred = svm_model.predict(X_test)\n 
       svm_accuracy = accuracy_score(y_test, svm_pred)\n\n        # Create a 
DataFrame to yield\n        result_table = pd.DataFrame({\n            
'svm_model': [svm_model],\n            'svm_accuracy': [svm_accuracy],\n        
    'X_test': [X_test],\n            'y_test': [y_test]\n        })\n\n        
yield Table(result_table)"
+    },
+    "edges": [
+        ["UDF1", "UDF2"],
+        ["UDF1", "UDF3"],
+        ["UDF3", "UDF4"],
+        ["UDF3", "UDF5"]
+    ],
+    "outputs": {
+        "UDF1": ["min_values", "max_values", "mean_values", "data"],
+        "UDF2": ["html-content"],
+        "UDF3": ["X_train", "X_test", "y_train", "y_test"],
+        "UDF4": ["rf_model", "rf_accuracy", "X_test", "y_test"],
+        "UDF5": ["svm_model", "svm_accuracy", "X_test", "y_test"]
+    }
+}
+\`\`\`
+`;
+
+export const WORKFLOW_PROMPT = `You are an expert in Python coding and 
workflow systems.
+Many users of Texera system are non-technical, but the notebooks they provide 
are written by technical people.
+They want to convert their notebooks to Texera workflows.
+Your goal is to help convert these notebooks into a Texera workflow that 
non-technical users can use directly.
+So do not remove or modify any classes or functions, preserve their names and 
structure as they are.
+Ensure that all essential logic remains intact.
+Create multiple Texera UDF codes using the provided Python code.
+Number each UDF, starting at 1 and incrementing, by starting with a comment 
that states that UDF number.
+
+Use the class and function names as shown in ProcessTupleOperator, 
ProcessTableOperator, and ProcessBatchOperator.
+Do not change the class names, function names, or input parameters.
+Use the ones that make sense and split the code meaningfully as instructed.
+
+Use the starter code provided for Python UDFs.
+
+Use the documentation of Table, Tuple, or Batch to work with parameters within 
Texera UDF.
+Do not import other libraries to define these types.
+
+There is no need for an __init__ function. Assume all inputs are valid pandas 
DataFrames,
+so do not use .to_pandas(), .to_dataframe(), etc. Do not load data from a file 
in the first UDF, assume
+that the data is already given to you in the table parameter.
+Ensure proper data flow between functions. Separate operators as if they will 
run in different files.
+
+Current UDF operators can only have one output. Build a dataframe to yield all 
necessary variables
+and data. Ensure proper data flow for each UDF and all information is yielded 
(including training
+and testing data) if subsequent UDFs need them.
+
+Ensure all necessary imports are included in each UDF code block.
+
+Each UDF operator should be in its own Python code block. Do not combine them 
into a single block.
+Ensure import statements cover all used functions and separate them as 
necessary.
+
+It is VERY important that all of the original code in the Jupyter notebook is 
represented in the generated workflow.
+Make sure that nothing in the original is removed and that the semantic 
meaning of what the original code was doing is retained.
+If there are user-defined Python classes, include the entire class definition 
in the appropriate UDF(s) that use that class.
+Always include the code that defines the class inside of every distinct UDF 
that uses that constructs an object of that class.
+Python classes are allowed in Texera UDFs and follow the same semantics as 
standard Python.
+They can be defined outside of ProcessTableOperator, ProcessTupleOperator, and 
ProcessBatchOperator.
+
+Return only the JSON formatted response, do not give any explanation.
+Make sure the response is a valid JSON structure, including closing all braces 
and not including commas after the last element.
+Follow this JSON format (don't reuse the values, this is just the format). 
'code', 'edges', and 'outputs' are all their own key's, do not nest any of 
these in another one and make sure to close their braces:
+{
+"code": {
+"UDF1": "code for UDF1 goes here",
+"UDF2": "code for UDF2 goes here"
+},
+"edges": [
+["UDF1", "UDF2"]
+],
+"outputs": {
+"UDF1": ["min_values", "max_values", "mean_values", "data"],
+"UDF2": ["html-content"]
+}
+}
+Make sure only the keys in the code section appear in the edges and outputs 
sections. Do not include any extraneous fields.
+Do not include any extraneous UDF's in the code field that include empty 
strings.
+Give ALL of the code, do not omit anything or use placeholders for code. Make 
sure ALL code in the original is translated over.
+Use only unescaped single quotes inside of the code values for the UDF's, do 
not use escaped double quotes.
+Convert following the instructions and examples given. Here is the code:
+`;
+
+export const MAPPING_PROMPT = `
+Here is an example of a mapping generated between the given example Python 
code and the Texera UDFs using their CELL and UDF IDs. Cell IDs are designated 
by the UUID following '# START'. The format should be kept the same.
+{
+"UDF1": [
+"CEll3",
+"CELL4"
+],

Review Comment:
   The example mapping JSON in the prompt contains `"CEll3"` (a typo with 
inconsistent casing; the neighboring entry is `"CELL4"`). Since the mapping 
response is later parsed with `JSON.parse`, small inconsistencies in the 
example increase the chance that the LLM returns unusable output; please 
change the entry to `"CELL3"` so the example is consistent.



##########
notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala:
##########
@@ -0,0 +1,358 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.texera.service.resource
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.typesafe.scalalogging.LazyLogging
+import jakarta.ws.rs._
+import jakarta.ws.rs.core._
+import org.apache.texera.dao.SqlServer
+import org.jooq.JSONB
+import org.apache.texera.dao.jooq.generated.tables.Notebook
+import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping
+import java.net.{HttpURLConnection, URL}
+import java.nio.charset.StandardCharsets
+import scala.util.control.NonFatal
+import org.apache.texera.amber.config.StorageConfig
+
+object NotebookMigrationResource extends LazyLogging {
+
+  private val mapper: ObjectMapper = new 
ObjectMapper().registerModule(DefaultScalaModule)
+  private val jupyterUrl = StorageConfig.jupyterURL
+  private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb"
+

Review Comment:
   `jupyterIframeURL` is a mutable global var. It’s updated per request and 
shared across all users/requests, which is not thread-safe and can cause 
cross-request leakage. Prefer computing the iframe URL per request (and/or 
store per-workflow/per-user state in DB) instead of keeping mutable singleton 
state.



##########
sql/updates/23.sql:
##########
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+-- ============================================
+-- 1. Connect to the texera_db database
+-- ============================================
+\c texera_db
+
+SET search_path TO texera_db;
+
+-- ============================================
+-- 2. Delete tables if they already exist
+-- ============================================
+
+BEGIN;
+
+DROP TABLE IF EXISTS notebook CASCADE;
+DROP TABLE IF EXISTS workflow_notebook_mapping CASCADE;
+

Review Comment:
   This migration drops the notebook/mapping tables unconditionally. Even if 
the update is intended to be run once, dropping tables can destroy existing 
data if the script is re-applied (e.g., in dev/test resets). Prefer creating 
the new tables without dropping, or at least avoid `CASCADE` and document/guard 
the destructive behavior.



##########
frontend/src/app/workspace/component/menu/menu.component.ts:
##########
@@ -175,6 +201,13 @@ export class MenuComponent implements OnInit, OnDestroy {
     // Subscribe to computing unit
     this.subscribeToComputingUnitSelection();
     this.subscribeToComputingUnitStatus();
+
+    this.importForm = this.fb.group({
+      description: [""],
+      file: [null, Validators.required],
+      model: [""],

Review Comment:
   `importForm` only requires `file`, so the modal can be submitted with an 
empty `model` value. That will pass an empty string into the LLM client and 
likely fail at runtime; make the model selection required (or set a safe 
default model when opening the modal).
   ```suggestion
         model: ["", Validators.required],
   ```



##########
frontend/src/app/workspace/service/notebook-migration/migration-prompts.ts:
##########
@@ -0,0 +1,410 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// TEXERA DOCUMENTATION
+
+// https://github.com/Texera/texera/wiki/Guide-to-Use-a-Python-UDF
+export const TEXERA_OVERVIEW = `
+You are a robust compiler that takes python code and translates it to our 
personal workflow environment Texera that uses python.
+
+  Texera is a data analytics tool that uses workflows to do machine learning 
and data analytics computation. User's are able to drag and drop operators and 
connect their inputs and outputs in a workflow graphical user interface, which 
the code we are going to create.
+
+Texera is able to use Python user defined functions. Documentation of a Python 
UDF in Texera follows:
+  Process Data APIs
+
+There are three APIs to process the data in different units.
+
+  Tuple API.
+
+  class ProcessTupleOperator(UDFOperatorV2):
+
+def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+yield tuple_
+
+Tuple API takes one input tuple from a port at a time. It returns an iterator 
of optional TupleLike instances. A TupleLike is any data structure that 
supports key-value pairs, such as pytexera.Tuple, dict, defaultdict, 
NamedTuple, etc.
+
+  Tuple API is useful for implementing functional operations which are applied 
to tuples one by one, such as map, reduce, and filter.
+
+  Table API.
+
+  class ProcessTableOperator(UDFTableOperator):
+
+def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:
+yield table
+
+Table API consumes a Table at a time, which consists of all the tuples from a 
port. It returns an iterator of optional TableLike instances. A TableLike is a 
collection of TupleLike, and currently, we support pytexera.Table and 
pandas.DataFrame as a TableLike instance. More flexible types will be supported 
down the road.
+
+  Table API is useful for implementing blocking operations that will consume 
all the data from one port, such as join, sort, and machine learning training.
+
+  Batch API.
+
+  class ProcessBatchOperator(UDFBatchOperator):
+
+BATCH_SIZE = 10
+
+def process_batch(self, batch: Batch, port: int) -> 
Iterator[Optional[BatchLike]]:
+yield batch
+
+Batch API consumes a batch of tuples at a time. Similar to Table, a Batch is 
also a collection of Tuples; however, its size is defined by the BATCH_SIZE, 
and one port can have multiple batches. It returns an iterator of optional 
BatchLike instances. A BatchLike is a collection of TupleLike, and currently, 
we support pytexera.Batch and pandas.DataFrame as a BatchLike instance. More 
flexible types will be supported down the road.
+
+  The Batch API serves as a hybrid API combining the features of both the 
Tuple and Table APIs. It is particularly valuable for striking a balance 
between time and space considerations, offering a trade-off that optimizes 
efficiency.
+
+  All three APIs can return an empty iterator by yield None.
+
+  The template code for a Python UDF follows: MAKE SURE TO USE THE CLASS NAMES 
AND FUNCTIONS DEFINED, THIS IS A MUST FOR THE PROGRAM TO WORK. SELECT 1 OUT OF 
THE 3 PROCESSING OPERATOR FUNCTIONS TO BUILD DEPENDINGO ON THE CONTEXT OF CODE 
TRANSLATION.
+# Choose from the following templates:
+  #
+# from pytexera import *
+#
+# class ProcessTupleOperator(UDFOperatorV2):
+#
+#     @overrides
+#     def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+#         yield tuple_
+#
+# class ProcessBatchOperator(UDFBatchOperator):
+#     BATCH_SIZE = 10 # must be a positive integer
+#
+#     @overrides
+#     def process_batch(self, batch: Batch, port: int) -> 
Iterator[Optional[BatchLike]]:
+#         yield batch
+#
+# class ProcessTableOperator(UDFTableOperator):
+#
+#     @overrides
+#     def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:
+#         yield table
+`;
+
+// 
https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/tuple.py
+export const TUPLE_DOCUMENTATION = `
+### **<code>Tuple</code> Class Overview**
+
+The \`Tuple\` class is a **lazy-evaluated** data structure designed for 
efficient field storage and access. It provides:
+
+  1. **Support for Multiple Data Sources**:
+* Can be initialized from a \`TupleLike\` object, such as a \`pandas.Series\`, 
\`OrderedDict\`, or another \`Tuple\` instance.
+* Works with \`ArrowTableTupleProvider\` to access \`pyarrow.Table\` data.
+2. **Lazy Field Evaluation**:
+* Field values can be either **directly stored values** or **lazy accessors** 
(\`field_accessor\`).
+* If a field is accessed and is an accessor, it is evaluated and cached.
+3. **Schema (<code>Schema</code>) Enforcement**:
+  * A \`Tuple\` can be created without a schema but can be **finalized** with 
one using \`finalize(schema)\`, which:
+* **Casts field values** (e.g., \`NaN → None\`, \`Object → Bytes\`).
+* **Validates field completeness**, ensuring all fields match the \`Schema\`.
+4. **Pythonic Access Patterns**:
+* **Index-based access**: \`tuple["field_name"]\` or \`tuple[index]\` 
retrieves field values.
+* **Dictionary-like operations**: \`tuple.as_dict()\` returns an 
\`OrderedDict\`, and \`tuple.as_series()\` converts to a \`pandas.Series\`.
+* **Iterable support**: \`for field in tuple\` iterates over field values.
+5. **Hashing and Comparisons**:
+* Implements \`__hash__\` using a Java-like hashing algorithm, allowing usage 
as dictionary keys.
+* Implements \`__eq__\`, supporting equality checks based on field contents.
+6. **Partial Data Extraction**:
+* \`tuple.get_partial_tuple(attribute_names)\` returns a new \`Tuple\` 
instance containing only the specified fields.
+`;
+
+// 
https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/table.py
+export const TABLE_DOCUMENTATION = `### **<code>Table</code> Class Overview**
+
+The \`Table\` class extends \`pandas.DataFrame\`, providing **structured 
Tuple-based data management**. It is designed to integrate seamlessly with 
\`Tuple\` objects.
+
+#### **Key Features:**
+
+1. **Flexible Construction:**
+* Can be initialized from various sources:
+* Another \`Table\` (\`from_table(table)\`)
+* A \`pandas.DataFrame\` (\`from_data_frame(df)\`)
+* A list/iterator of \`TupleLike\` objects (\`from_tuple_likes(tuple_likes)\`)
+* Ensures all \`Tuple\` objects in a \`Table\` have **consistent field names**.
+2. **Tuple Conversion:**
+* \`as_tuples()\`: Converts the table rows into an **iterator of 
<code>Tuple</code> instances**, preserving the row order.
+3. **Equality Comparison (<code>__eq__</code>):**
+* Supports **row-wise equality checks** by comparing the underlying \`Tuple\` 
objects.
+4. **Universal Tuple Output (<code>all_output_to_tuple</code>):**
+* A helper function to convert **various data types** into \`Tuple\` 
iterators, supporting:
+* \`None\` → \`[None]\`
+* \`Table\` → \`as_tuples()\`
+* \`pandas.DataFrame\` → Converted into a \`Table\`, then to Tuples
+* \`List[TupleLike]\` → Converted to \`Tuple\` instances
+* A single \`TupleLike\` or \`Tuple\` → Wrapped in an iterator
+
+#### **Relation to <code>Tuple</code>:**
+
+* \`Table\` **stores multiple <code>Tuple</code> objects** and ensures schema 
consistency across rows.
+* Provides an **efficient bridge** between \`Tuple\`-based data and 
\`pandas.DataFrame\`, enabling compatibility with Python's data analysis tools.
+`;
+
+// 
https://github.com/Texera/texera/blob/42d803310c180978a9f02992f0e05556796b293c/core/amber/src/main/python/core/models/operator.py
+export const OPERATOR_DOCUMENTATION = `### **Operator Class Overview**
+
+The \`Operator\` class is an **abstract base class (ABC)** for all operators, 
defining the fundamental structure for processing \`Tuple\`, \`Batch\`, and 
\`Table\` data in a workflow.
+
+#### **Key Features & Hierarchy**
+
+1. **Base <code>Operator</code> Class**:
+* Defines lifecycle methods: \`open()\` and \`close()\`.
+* Supports a **source flag (<code>is_source</code>)** to distinguish source 
operators from others.
+2. **Tuple-Based Processing (<code>TupleOperatorV2</code>)**:
+* Processes individual \`Tuple\` objects through \`process_tuple(tuple_, 
port)\`.
+* Calls \`on_finish(port)\` when an input port is exhausted.
+3. **Types of Operators**:
+* **SourceOperator**:
+* Produces data via \`produce()\`, yielding \`TupleLike\` or \`TableLike\` 
objects.
+* Overrides \`on_finish(port)\` to output produced data.
+* **BatchOperator**:
+* Collects tuples into batches (\`BATCH_SIZE\`) before processing via 
\`process_batch(batch, port)\`.
+* Converts processed batches (typically \`pandas.DataFrame\`) into \`Tuple\` 
output.
+* **TableOperator**:
+* Collects tuples into a \`Table\` before processing via 
\`process_table(table, port)\`.
+* Converts processed \`Table\` output back into tuples.
+4. **Data Flow & Processing**:
+* Operators receive data **tuple-by-tuple**, **batch-by-batch**, or 
**table-by-table** depending on the type.
+* Results are **iterators** of transformed data (\`TupleLike\`, \`BatchLike\`, 
or \`TableLike\`).
+5. **Deprecated <code>TupleOperator</code>**:
+* The older version of \`TupleOperator\` is deprecated in favor of 
\`TupleOperatorV2\`.
+
+#### Relation to <code>Tuple</code> and <code>Table</code>
+
+* Operators **consume and transform** \`Tuple\` and \`Table\` data within a 
workflow.
+* **Tuple-based operators** process row-wise, while **Table operators** handle 
structured table transformations.
+* **Source operators** initiate the data flow by generating tuples or tables.`;
+
+export const UDF_INPUT_PORT_DOCUMENTATION = `
+Python UDF operators support multiple input and output ports, allowing a 
single operator to receive different types of data from various upstream 
operators. In the process_tuple(self, tuple_: Tuple, port: int) function in 
ProcessTupleOperator and the process_table(self, table: Table, port: int) 
function in ProcessTableOperator, the port parameter indicates the input port. 
The port numbers are assigned in order, starting from 0 to N, from top to 
bottom. When input data have different schemas, it is necessary to assign them 
to different input ports. However, if all input data share the same schema, 
additional ports are not required. In both ProcessTupleOperator and 
ProcessTableOperator, there is an on_finish(self, port: int) function that is 
executed only after all the tuples from the specified port are processed.
+
+Using this knowledge, for situations where multiple upstream UDFs act as input 
to a single UDF, we can introduce an intermediary UDF that collects all of the 
input data and reformats it into a single table, which is then passed as input 
to the original next downstream UDF. When it is necessary for this to occur in 
your translation from notebook to UDFs, include the intermediary UDF and make 
sure that it and the next operator that uses its output is formatted correctly 
and handles the data transfer properly.
+`;
+
+export const EXAMPLE_OF_GOOD_CONVERSION = `
+Here is an example of python code translated into a compatible Texera UDF that 
gives output that abides the output schema compatible with the Texera workflow 
operators for tuples. Other operators do not always follow this strict format, 
but the yielding output structure is important.
+
+Python Code (high level idea): We have a python code that given some data, we 
limit the number of data.
+
+Texera Operator code:
+from pytexera import *
+
+class ProcessTupleOperator(UDFOperatorV2):
+def __init__(self):
+self.limit = 10
+self.count = 0
+@overrides
+def process_tuple(self, tuple_: Tuple, port: int) -> 
Iterator[Optional[TupleLike]]:
+if(self.count < self.limit):
+self.count += 1
+yield tuple_
+
+`;
+
+export const VISUALIZER_DOCUMENTATION = `
+Texera requires a unique way of generating visualizations from ML libraries:
+1. Ensures one yield per operator (per Texera’s UDF constraints).
+2. Uses Plotly for visualization and outputs results as embeddable HTML.
+3. Error handling is built-in to notify users when data is missing.
+`;
+
+export const EXAMPLE_OF_MULTIPLE_UDF_CONVERSION = `
+Here is an example of breaking up python code into multiple Texera UDFs. 
Format your response structure exactly like the given example. The "code" key 
contains a dictionary of the UDF ID's with their respective code. The "edges" 
key contains a list of pairs that contains the connections between UDFs. The 
"outputs" key contains a dictionary of the UDF ID's with a list of variable 
names that they yield in the UDF code. The UDFs can branch and merge, it does 
not have to be a linear chain depending on your implementation.
+
+Original Code:
+\`\`\`python
+# START CELL1
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+# END CELL1
+
+# START CELL2
+# Load the dataset
+file_path = 'diabetes.csv'
+data = pd.read_csv(file_path)
+# END CELL2
+
+# START CELL3
+# Remove duplicate rows
+data = data.drop_duplicates()
+
+# Remove rows with null values
+data = data.dropna()
+# END CELL3
+
+# START CELL4
+# Print the minimum, maximum, and mean for all fields
+print("Minimum values:\n", data.min())
+print("\nMaximum values:\n", data.max())
+print("\nMean values:\n", data.mean())
+# END CELL 4
+
+# START CELL5
+# Create a boxplot for the 'Pregnancies' field
+plt.figure(figsize=(8, 6))
+plt.boxplot(data['Pregnancies'], vert=False, patch_artist=True)
+plt.title('Boxplot of Pregnancies')
+plt.xlabel('Number of Pregnancies')
+plt.show()
+# END CELL5
+
+# START CELL6
+# Separate features and target variable
+X = data.drop('Outcome', axis=1)
+y = data['Outcome']
+# END CELL6
+
+# START CELL7
+# Split data into training and testing sets (80% train, 20% test)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42)
+
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test = scaler.transform(X_test)
+# END CELL7
+
+# START CELL8
+# Train Random Forest model
+rf_model = RandomForestClassifier(random_state=42)
+rf_model.fit(X_train, y_train)
+rf_pred = rf_model.predict(X_test)
+rf_accuracy = accuracy_score(y_test, rf_pred)
+print(f"Random Forest Accuracy: {rf_accuracy:.2%}")
+# END CELL8
+
+# START CELL9
+# Train SVM model
+svm_model = SVC(random_state=42)
+svm_model.fit(X_train, y_train)
+svm_pred = svm_model.predict(X_test)
+svm_accuracy = accuracy_score(y_test, svm_pred)
+print(f"SVM Accuracy: {svm_accuracy:.2%}")
+# END CELL9
+\`\`\`
+
+Texera UDF conversion:
+\`\`\`json
+{
+    "code": {
+        "UDF1": "# UDF1\nfrom pytexera import *\nimport pandas as pd\nfrom 
typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        # Remove duplicate rows\n        data = 
table.drop_duplicates()\n\n        # Remove rows with null values\n        data 
= data.dropna()\n\n        # Calculate statistics\n        min_values = 
data.min()\n        max_values = data.max()\n        mean_values = 
data.mean()\n\n        # Create a DataFrame to yield\n        result_table = 
pd.DataFrame({\n            'min_values': [min_values],\n            
'max_values': [max_values],\n            'mean_values': [mean_values],\n        
    'data': [data]\n        })\n\n        yield Table(result_table)",
+        "UDF2": "# UDF2\nfrom pytexera import *\nimport pandas as pd\nimport 
plotly.express as px\nimport plotly.io\nfrom typing import Iterator, 
Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n    def 
render_error(self, error_msg):\n        return '''<h1>Boxplot is not 
available.</h1>\n                  <p>Reason is: {} </p>\n               
'''.format(error_msg)\n\n    @overrides\n    def process_table(self, table: 
Table, port: int) -> Iterator[Optional[TableLike]]:\n        data = 
table['data'].iloc[0]\n\n        if data.empty:\n            yield 
{'html-content': self.render_error('input table is empty.')}\n            
return\n\n        # Create a boxplot for the 'Pregnancies' field\n        fig = 
px.box(data, x='Pregnancies')\n        fig.update_layout(margin=dict(l=0, r=0, 
t=0, b=0))\n\n        # Convert fig to HTML content\n        html = 
plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)\n        yield 
{'html-content': html}",
+        "UDF3": "# UDF3\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing 
import StandardScaler\nfrom typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        data = table['data'].iloc[0]\n\n        
# Separate features and target variable\n        X = data.drop('Outcome', 
axis=1)\n        y = data['Outcome']\n\n        # Split data into training and 
testing sets (80% train, 20% test)\n        X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=42)\n\n        scaler = 
StandardScaler()\n        X_train = scaler.fit_transform(X_train)\n        
X_test = scaler.transform(X_test)\n\n        # Create a DataFrame to yield\n    
    result_table = pd.DataFrame({\n            'X_train': [X_train], 'X_test': 
[X_test],\n            'y_train': [y_train], 
'y_test': [y_test]\n        })\n\n        yield Table(result_table)",
+        "UDF4": "# UDF4\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import 
accuracy_score\nfrom typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n    @overrides\n    def 
process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        X_train = table['X_train'].iloc[0]\n    
    y_train = table['y_train'].iloc[0]\n        X_test = 
table['X_test'].iloc[0]\n        y_test = table['y_test'].iloc[0]\n\n        # 
Train Random Forest model\n        rf_model = 
RandomForestClassifier(random_state=42)\n        rf_model.fit(X_train, 
y_train)\n        rf_pred = rf_model.predict(X_test)\n        rf_accuracy = 
accuracy_score(y_test, rf_pred)\n\n        # Create a DataFrame to yield\n      
  result_table = pd.DataFrame({\n            'rf_model': [rf_model],\n          
  'rf_accuracy': [rf_accuracy],\n            'X_test': [X_test],\n            
'y_test': [y_test]\n        })\n\n        yield Table(result_table)",
+        "UDF5": "# UDF5\nfrom pytexera import *\nimport pandas as pd\nfrom 
sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score\nfrom typing 
import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n  
  @overrides\n    def process_table(self, table: Table, port: int) -> 
Iterator[Optional[TableLike]]:\n        X_train = table['X_train'].iloc[0]\n    
    y_train = table['y_train'].iloc[0]\n        X_test = 
table['X_test'].iloc[0]\n        y_test = table['y_test'].iloc[0]\n\n        # 
Train SVM model\n        svm_model = SVC(random_state=42)\n        
svm_model.fit(X_train, y_train)\n        svm_pred = svm_model.predict(X_test)\n 
       svm_accuracy = accuracy_score(y_test, svm_pred)\n\n        # Create a 
DataFrame to yield\n        result_table = pd.DataFrame({\n            
'svm_model': [svm_model],\n            'svm_accuracy': [svm_accuracy],\n        
    'X_test': [X_test],\n            'y_test': [y_test]\n        })\n\n        
yield Table(result_table)"
+    },
+    "edges": [
+        ["UDF1", "UDF2"],
+        ["UDF1", "UDF3"],
+        ["UDF3", "UDF4"],
+        ["UDF3", "UDF5"]
+    ],
+    "outputs": {
+        "UDF1": ["min_values", "max_values", "mean_values", "data"],
+        "UDF2": ["html-content"],
+        "UDF3": ["X_train", "X_test", "y_train", "y_test"],
+        "UDF4": ["rf_model", "rf_accuracy", "X_test", "y_test"],
+        "UDF5": ["svm_model", "svm_accuracy", "X_test", "y_test"]
+    }
+}
+\`\`\`
+`;
+
+export const WORKFLOW_PROMPT = `You are an expert in Python coding and 
workflow systems.
+Many users of Texera system are non-technical, but the notebooks they provide 
are written by technical people.
+They want to convert their notebooks to Texera workflows.
+Your goal is to help convert these notebooks into a Texera workflow that 
non-technical users can use directly.
+So do not remove or modify any classes or functions, preserve their names and 
structure as they are.
+Ensure that all essential logic remains intact.
+Create multiple Texera UDF codes using the provided Python code.
+Number each UDF, starting at 1 and incrementing, by starting with a comment 
that states that UDF number.
+
+Use the class and function names as shown in ProcessTupleOperator, 
ProcessTableOperator, and ProcessBatchOperator.
+Do not change the class names, function names, or input parameters.
+Use the ones that make sense and split the code meaningfully as instructed.
+
+Use the starter code provided for Python UDFs.
+
+Use the documentation of Table, Tuple, or Batch to work with parameters within 
Texera UDF.
+Do not import other libraries to define these types.
+
+There is no need for an __init__ function. Assume all inputs are valid pandas 
DataFrames,
+so do not use .to_pandas(), .to_dataframe(), etc. Do not load data from a file 
in the first UDF, assume
+that the data is already given to you in the table parameter.
+Ensure proper data flow between functions. Separate operators as if they will 
run in different files.
+
+Current UDF operators can only have one output. Build a dataframe to yield all 
necessary variables
+and data. Ensure proper data flow for each UDF and all information is yielded 
(including training
+and testing data) if subsequent UDFs need them.
+
+Ensure all necessary imports are included in each UDF code block.
+
+Each UDF operator should be in its own Python code block. Do not combine them 
into a single block.
+Ensure import statements cover all used functions and separate them as 
necessary.
+
+It is VERY important that all of the original code in the Jupyter notebook is 
represented in the generated workflow.
+Make sure that nothing in the original is removed and that the semantic 
meaning of what the original code was doing is retained.
+If there are user-defined Python classes, include the entire class definition 
in the appropriate UDF(s) that use that class.
+Always include the code that defines the class inside of every distinct UDF 
that uses that constructs an object of that class.
+Python classes are allowed in Texera UDFs and follow the same semantics as 
standard Python.
+They can be defined outside of ProcessTableOperator, ProcessTupleOperator, and 
ProcessBatchOperator.
+
+Return only the JSON formatted response, do not give any explanation.
+Make sure the response is a valid JSON structure, including closing all braces 
and not including commas after the last element.
+Follow this JSON format (don't reuse the values, this is just the format). 
'code', 'edges', and 'outputs' are all their own key's, do not nest any of 
these in another one and make sure to close their braces:
+{
+"code": {
+"UDF1": "code for UDF1 goes here",
+"UDF2": "code for UDF2 goes here"
+},
+"edges": [
+["UDF1", "UDF2"]
+],
+"outputs": {
+"UDF1": ["min_values", "max_values", "mean_values", "data"],
+"UDF2": ["html-content"]
+}
+}
+Make sure only the keys in the code section appear in the edges and outputs 
sections. Do not include any extraneous fields.
+Do not include any extraneous UDF's in the code field that include empty 
strings.
+Give ALL of the code, do not omit anything or use placeholders for code. Make 
sure ALL code in the original is translated over.
+Use only unescaped single quotes inside of the code values for the UDF's, do 
not use escaped double quotes.
+Convert following the instructions and examples given. Here is the code:
+`;
+
+export const MAPPING_PROMPT = `
+Here is an example of a mapping generated between the given example Python 
code and the Texera UDFs using their CELL and UDF IDs. Cell IDs are designated 
by the UUID following '# START'. The format should be kept the same.
+{
+"UDF1": [
+"CEll3",
+"CELL4"
+],
+"UDF2": [
+"CELL5"
+],
+"UDF3": [
+"CELL6",
+"CELL7"
+]

Review Comment:
   The example JSON in `MAPPING_PROMPT` is invalid (missing a comma between the 
`UDF3` array and the `UDF4` key). Because `convertNotebookToWorkflow` does 
`JSON.parse` on the LLM output, this kind of invalid example can cause the 
model to mimic invalid JSON and break the flow; please fix the example to be 
valid JSON.
   ```suggestion
   ],
   ```



##########
frontend/src/app/workspace/service/notebook-migration/migration-llm.ts:
##########
@@ -0,0 +1,299 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { Injectable } from "@angular/core";
+import { firstValueFrom, from, Observable, of } from "rxjs";
+import { map } from "rxjs/operators";
+import { createOpenAI } from "@ai-sdk/openai";
+import { generateText, type ModelMessage } from "ai";
+import { AppSettings } from "../../../common/app-setting";
+import { v4 as uuidv4 } from "uuid";
+import {
+  TEXERA_OVERVIEW,
+  TUPLE_DOCUMENTATION,
+  TABLE_DOCUMENTATION,
+  OPERATOR_DOCUMENTATION,
+  UDF_INPUT_PORT_DOCUMENTATION,
+  EXAMPLE_OF_GOOD_CONVERSION,
+  VISUALIZER_DOCUMENTATION,
+  EXAMPLE_OF_MULTIPLE_UDF_CONVERSION,
+  WORKFLOW_PROMPT,
+  MAPPING_PROMPT,
+} from "./migration-prompts";
+
+interface Cell {
+  cell_type: string;
+  metadata: { [key: string]: any };
+  source: string;
+}
+
+export interface Notebook {
+  cells: Cell[];
+}
+
+interface WorkflowJSON {
+  operators: any[];
+  operatorPositions: Record<string, { x: number; y: number }>;
+  links: any[];
+  commentBoxes: any[];
+  settings: {
+    dataTransferBatchSize: number;
+  };
+}
+
+interface CombinedMapping {
+  operator_to_cell: Record<string, string[]>;
+  cell_to_operator: Record<string, string[]>;
+}
+
+@Injectable()
+export class NotebookMigrationLLM {
+  private model: any;
+  private messages: ModelMessage[] = [];
+  private initialized = false;
+
+  private static readonly DOCUMENTATION: string[] = [
+    TEXERA_OVERVIEW,
+    TUPLE_DOCUMENTATION,
+    TABLE_DOCUMENTATION,
+    OPERATOR_DOCUMENTATION,
+    EXAMPLE_OF_GOOD_CONVERSION,
+    VISUALIZER_DOCUMENTATION,
+    UDF_INPUT_PORT_DOCUMENTATION,
+    EXAMPLE_OF_MULTIPLE_UDF_CONVERSION,
+  ];
+
+  /**
+   * Initialize a new LLM session with Texera documentation
+   */
+  public initialize(modelType: string = "gpt-5-mini", apiKey: string = 
"dummy"): void {
+    this.model = createOpenAI({
+      baseURL: new URL(`${AppSettings.getApiEndpoint()}`, 
document.baseURI).toString(),
+      // apiKey is required by the library for creating the OpenAI compatible 
client;
+      // For security reason, we store the apiKey at the backend, thus the 
value is dummy here.
+      apiKey: apiKey,

Review Comment:
   `initialize` claims the API key is stored at the backend and should be a 
dummy value, but it actually passes the user-provided `apiKey` into the OpenAI 
client, which will typically be sent in an `Authorization` header on requests. 
If the intention is to keep keys server-side (or rely on the server’s master 
key), this should not forward the user’s key from the browser; otherwise, the 
backend/proxy needs an explicit, secure design for handling user-provided keys.
   ```suggestion
       void apiKey;
       this.model = createOpenAI({
         baseURL: new URL(`${AppSettings.getApiEndpoint()}`, 
document.baseURI).toString(),
         // apiKey is required by the library for creating the OpenAI 
compatible client;
         // For security reason, we store the apiKey at the backend, thus the 
value is dummy here.
         apiKey: "dummy",
   ```



##########
notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala:
##########
@@ -0,0 +1,358 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.texera.service.resource
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.typesafe.scalalogging.LazyLogging
+import jakarta.ws.rs._
+import jakarta.ws.rs.core._
+import org.apache.texera.dao.SqlServer
+import org.jooq.JSONB
+import org.apache.texera.dao.jooq.generated.tables.Notebook
+import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping
+import java.net.{HttpURLConnection, URL}
+import java.nio.charset.StandardCharsets
+import scala.util.control.NonFatal
+import org.apache.texera.amber.config.StorageConfig
+
+object NotebookMigrationResource extends LazyLogging {
+
+  private val mapper: ObjectMapper = new 
ObjectMapper().registerModule(DefaultScalaModule)
+  private val jupyterUrl = StorageConfig.jupyterURL
+  private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb"
+
+  private def isJupyterAvailable(jupyterUrl: String): Boolean = {
+    try {
+      val conn = new java.net.URL(s"$jupyterUrl/api")
+        .openConnection()
+        .asInstanceOf[java.net.HttpURLConnection]
+
+      conn.setRequestMethod("GET")
+      conn.setConnectTimeout(2000)
+      conn.setReadTimeout(2000)
+
+      val status = conn.getResponseCode
+
+      status == 200 || status == 403
+    } catch {
+      case _: Exception => false
+    }
+  }
+
+  // Returns the Jupyter iframe reference URL
+  def getJupyterIframeURL(): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    Response
+      .ok(
+        s"""
+    {
+      "success": true,
+      "url": "$jupyterIframeURL"
+    }
+    """
+      )
+      .build()
+  }
+
+  // Returns the URL of Jupyter
+  def getJupyterURL(): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    Response
+      .ok(
+        s"""
+    {
+      "success": true,
+      "url": "$jupyterUrl"
+    }
+    """
+      )
+      .build()
+  }
+
+  // Set the notebook in Jupyter
+  def setNotebook(body: String): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    try {
+      val json = mapper.readTree(body)
+
+      val notebookName = json.get("notebookName").asText()
+      val notebookData = json.get("notebookData")
+

Review Comment:
   `setNotebook` accepts an arbitrary `notebookName`, but later the iframe URL 
is always set to `.../notebook.ipynb` (hard-coded). This makes the API 
internally inconsistent if a different name is provided; either enforce a fixed 
name in the request contract or derive the iframe URL from the uploaded 
`notebookName`.



##########
frontend/src/app/workspace/component/menu/menu.component.html:
##########
@@ -456,3 +465,113 @@
     </div>
   </div>
 </div>
+
+<ng-template
+  #importNotebookModal
+  let-data>
+  <form
+    [formGroup]="importForm"
+    nz-form>
+    <div style="text-align: center; margin-bottom: 12px">
+      <img
+        ngSrc="assets/notebook_migration_tool/tool_popup_diagram.png"
+        alt="Notebook to Workflow"
+        width="1646"
+        height="479"
+        style="max-width: 100%; height: auto; border-radius: 8px" />
+    </div>
+
+    <nz-form-item>
+      <p style="margin: 0; font-size: 16px; font-weight: 500; user-select: 
text">
+        This tool converts a Python Jupyter Notebook into a Texera workflow 
using LLM capabilities. After you submit a
+        notebook, the LLM service will generate a corresponding Texera 
workflow. The conversion time depends on the
+        notebook’s complexity and can take 1–5 minutes. Once the process is 
complete, the workflow workspace will reload
+        with:
+      </p>
+      <ol
+        style="
+          margin-top: 4px;
+          margin-bottom: 0;
+          padding-left: 20px;
+          font-size: 16px;
+          font-weight: 500;
+          user-select: text;
+        ">
+        <li>
+          The generated workflow ready to use (Note: you will still need to 
upload the dataset and connect it to the
+          workflow).
+        </li>
+        <li>A floating Jupyter window containing the uploaded notebook for 
reference.</li>
+      </ol>
+      <p style="margin: 0; font-size: 16px; font-weight: 500; user-select: 
text">
+        Feel free to navigate away from this tab while you wait for the 
workflow to generate. Please do not close the
+        window.
+      </p>
+    </nz-form-item>
+
+    <nz-form-item>
+      <nz-form-label [nzNoColon]="true">
+        <span style="font-weight: 1000; font-size: 16px"> Upload Python 
Jupyter Notebook </span>
+      </nz-form-label>
+      <nz-form-control>
+        <div style="display: inline-flex; align-items: center; gap: 8px">
+          <nz-upload
+            [nzBeforeUpload]="beforeUpload"
+            [nzShowUploadList]="false">
+            <button
+              nz-button
+              style="white-space: normal">
+              <i
+                nz-icon
+                nzType="upload"></i>
+            </button>
+          </nz-upload>
+
+          <span *ngIf="importForm.get('file')?.value?.name">
+            Selected file: {{ importForm.get('file')?.value?.name }}
+          </span>
+        </div>
+      </nz-form-control>
+    </nz-form-item>
+
+    <nz-form-item>
+      <nz-form-label [nzNoColon]="true">
+        <span style="font-weight: 1000; font-size: 16px"> Select Model Type 
</span>
+      </nz-form-label>
+
+      <nz-form-control>
+        <ng-container *ngIf="data.models$ | async as models; else loadingTpl">
+          <nz-select
+            formControlName="model"
+            nzPlaceHolder="Select a model"
+            style="width: 50%">
+            <nz-option
+              *ngFor="let model of $any(models)"
+              [nzValue]="model.name"
+              [nzLabel]="model.name"></nz-option>
+          </nz-select>
+        </ng-container>
+
+        <ng-template #loadingTpl>
+          <nz-select
+            nzPlaceHolder="Loading models..."
+            [nzLoading]="true"
+            [nzDisabled]="true"
+            style="width: 50%"></nz-select>
+        </ng-template>
+      </nz-form-control>
+    </nz-form-item>
+
+    <nz-form-item>
+      <nz-form-label [nzNoColon]="true">
+        <span style="font-weight: 1000; font-size: 16px"> API Key </span>
+      </nz-form-label>
+      <nz-form-control>
+        <input
+          nz-input

Review Comment:
   The API key input is rendered as a plain text field. Since this value is 
sensitive, it should be a password-type input (and ideally disable 
autocomplete) to reduce accidental disclosure (screen sharing, browser autofill 
history, etc.).
   ```suggestion
             nz-input
             type="password"
             autocomplete="off"
   ```



##########
frontend/src/app/workspace/component/menu/menu.component.ts:
##########
@@ -550,6 +583,168 @@ export class MenuComponent implements OnInit, OnDestroy {
     this.workflowActionService.deleteOperatorsAndLinks(allOperatorIDs);
   }
 
+  openImportNotebookModal(): void {
+    const models$ = this.notebookMigrationService.getAvailableModels().pipe(
+      tap({
+        error: () => this.notificationService.error("Failed to fetch models"),
+      })
+    );
+
+    const modalRef = this.modal.create({
+      nzTitle: "AI Generate Workflow from Python Notebook",
+      nzContent: this.importModalTpl,
+      nzViewContainerRef: this.viewContainerRef,
+      nzWidth: 700,
+      nzData: {
+        models$: models$,
+      },
+      nzFooter: [
+        {
+          label: "Cancel",
+          onClick: () => {
+            modalRef.close();
+          },
+        },
+        {
+          label: "Submit",
+          type: "primary",
+          disabled: () => !this.importForm.valid,
+          onClick: () => {
+            const file: NzUploadFile = this.importForm.get("file")?.value;
+            const model: string = this.importForm.get("model")?.value;
+            const apiKey: string = this.importForm.get("apiKey")?.value;
+            this.onClickImportNotebook(file, model, apiKey);
+            modalRef.close(); // close after submit too
+          },
+        },
+      ],
+    });
+  }
+
+  public beforeUpload = (file: NzUploadFile) => {
+    this.importForm.patchValue({ file });
+    this.importForm.get("file")?.markAsDirty();
+    this.importForm.get("file")?.updateValueAndValidity();
+    return false; // prevent auto upload
+  };
+
+  public onClickImportNotebook = (file: NzUploadFile, model: string, apiKey: 
string): boolean => {
+    const reader = new FileReader();
+
+    // Check if the file is a Jupyter notebook based on its extension
+    const fileExtension = file.name.split(".").pop()?.toLowerCase();
+    if (fileExtension !== "ipynb") {
+      this.notificationService.error("Please upload a valid Jupyter Notebook 
(.ipynb) file.");
+      return false;
+    }
+
+    this.setWaitingForLLM.emit(true); // start loading
+
+    // Read the notebook file as text
+    reader.readAsText(file as any);
+    reader.onload = async () => {
+      try {
+        const result = reader.result;
+        if (typeof result !== "string") {
+          throw new Error("File content is not a valid string.");
+        }
+
+        // Parse the content of the .ipynb file (it's in JSON format)
+        const notebookContent = JSON.parse(result) as Notebook;
+
+        // Validate the notebook structure
+        if (!notebookContent || !Array.isArray(notebookContent.cells)) {
+          throw new Error("Invalid notebook structure.");
+        }
+
+        // Add UUID's to each cell in the notebook
+        for (const cell of notebookContent.cells) {
+          if (!cell.metadata) {
+            cell.metadata = {};
+          }
+          cell.metadata.uuid = uuidv4();
+        }
+
+        // Send Notebook JSON to pod to open in jupyterlab
+        await 
this.notebookMigrationService.sendNotebookToJupyter(notebookContent);
+
+        // Get workflow and mapping from LLM
+        await this.notebookMigrationService
+          .sendToAIGenerateWorkflow(notebookContent, model, apiKey)
+          .then(result => {
+            if (result) {
+              const { workflowContent, mappingContent } = result;
+
+              const fileExtensionIndex = file.name.lastIndexOf(".");
+              var workflowName: string;
+              if (fileExtensionIndex === -1) {
+                workflowName = file.name;
+              } else {
+                workflowName = file.name.substring(0, fileExtensionIndex);
+              }
+              if (workflowName.trim() === "") {
+                workflowName = DEFAULT_WORKFLOW_NAME;
+              }
+
+              const workflow: Workflow = {
+                content: workflowContent,
+                name: `${workflowName}_GENERATED_BY_LLM`,
+                isPublished: 0,
+                description: undefined,
+                wid: undefined,
+                creationTime: undefined,
+                lastModifiedTime: undefined,
+                readonly: false,
+              };
+
+              this.workflowPersistService
+                .persistWorkflow(workflow)
+                .pipe(
+                  switchMap((updatedWorkflow: Workflow) => {
+                    const mappingID = "mapping_wid_" + updatedWorkflow.wid;
+
+                    this.notebookMigrationService.setMapping(mappingID, 
mappingContent);
+
+                    return this.notebookMigrationService
+                      .storeNotebookAndMapping(updatedWorkflow.wid, 1, 
mappingContent, notebookContent)
+                      .pipe(map(() => updatedWorkflow));
+                  }),
+                  untilDestroyed(this)
+                )
+                .subscribe({
+                  next: updatedWorkflow => {
+                    this.workflowActionService.reloadWorkflow(updatedWorkflow, 
true);
+                    this.jupyterPanelService.openPanel("JupyterNotebookPanel");
+                    this.notificationService.success("Successfully generated 
workflow and mapping from notebook.");
+                  },
+                  error: (err: unknown) => {
+                    this.notificationService.error("Failed to import notebook, 
check console for detailed error");
+                    console.error("Import notebook failed:", err);
+                  },
+                  complete: () => {
+                    this.setWaitingForLLM.emit(false);
+                  },
+                });
+            } else {
+              console.error("Result is undefined");
+            }
+          })
+          .catch(error => {
+            this.notificationService.error("Error while communicating with 
LLM, check console for details");
+            console.error("Error while fetching data from LLM: ", error);
+          })
+          .finally(() => {
+            this.setWaitingForLLM.emit(false); // stop loading
+          });
+      } catch (error) {
+        this.notificationService.error("Failed to import the notebook.");
+        console.error(error);

Review Comment:
   If an error happens before the `sendToAIGenerateWorkflow(...).finally(...)` 
block is reached (e.g., JSON parse failure, invalid notebook structure, failure 
sending notebook to Jupyter), `setWaitingForLLM` will remain `true` and the 
spinner can get stuck. Add an outer `finally` (or emit `false` in the outer 
`catch`) so the loading state is always cleared.
   ```suggestion
             });
         } catch (error) {
           this.notificationService.error("Failed to import the notebook.");
           console.error(error);
         } finally {
           this.setWaitingForLLM.emit(false); // stop loading
   ```



##########
notebook-migration-service/src/main/resources/custom.js:
##########
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// Use Jupyter's event system to ensure the notebook is fully loaded
+require(["base/js/events"], function (events) {
+  events.on("kernel_ready.Kernel", function () {
+
+    // Attach click event listener to cells
+    $("#notebook-container").on("click", ".cell", function (event) {
+      const cell = $(this);
+      const index = $(".cell").index(cell);
+      const cellContent = cell.find(".input_area").text();
+
+      // Get the UUID from the cell's metadata, or use "N/A" if it doesn't 
exist
+      const cellUUID = Jupyter.notebook.get_cell(index).metadata.uuid || 'N/A';
+
+      // Send a message to the parent window (Texera app)
+      window.parent.postMessage(
+        { action: "cellClicked", cellIndex: index, cellContent: cellContent, 
cellUUID: cellUUID },
+        "http://localhost:4200";
+      );

Review Comment:
   `postMessage` is hard-coded to target origin `http://localhost:4200`, and 
the receiver side also hard-codes that same origin. This will break in 
non-local deployments (different host/port) and is easy to misconfigure; please 
make the allowed/target origin configurable (or derive it safely from runtime 
config) rather than hard-coding localhost.



##########
notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala:
##########
@@ -0,0 +1,358 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.texera.service.resource
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.typesafe.scalalogging.LazyLogging
+import jakarta.ws.rs._
+import jakarta.ws.rs.core._
+import org.apache.texera.dao.SqlServer
+import org.jooq.JSONB
+import org.apache.texera.dao.jooq.generated.tables.Notebook
+import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping
+import java.net.{HttpURLConnection, URL}
+import java.nio.charset.StandardCharsets
+import scala.util.control.NonFatal
+import org.apache.texera.amber.config.StorageConfig
+
+object NotebookMigrationResource extends LazyLogging {
+
+  private val mapper: ObjectMapper = new 
ObjectMapper().registerModule(DefaultScalaModule)
+  private val jupyterUrl = StorageConfig.jupyterURL
+  private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb"
+
+  private def isJupyterAvailable(jupyterUrl: String): Boolean = {
+    try {
+      val conn = new java.net.URL(s"$jupyterUrl/api")
+        .openConnection()
+        .asInstanceOf[java.net.HttpURLConnection]
+
+      conn.setRequestMethod("GET")
+      conn.setConnectTimeout(2000)
+      conn.setReadTimeout(2000)
+
+      val status = conn.getResponseCode
+
+      status == 200 || status == 403
+    } catch {
+      case _: Exception => false
+    }
+  }
+
+  // Returns the Jupyter iframe reference URL
+  def getJupyterIframeURL(): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    Response
+      .ok(
+        s"""
+    {
+      "success": true,
+      "url": "$jupyterIframeURL"
+    }
+    """
+      )
+      .build()
+  }
+
+  // Returns the URL of Jupyter
+  def getJupyterURL(): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    Response
+      .ok(
+        s"""
+    {
+      "success": true,
+      "url": "$jupyterUrl"
+    }
+    """
+      )
+      .build()
+  }
+
+  // Set the notebook in Jupyter
+  def setNotebook(body: String): Response = {
+    if (!isJupyterAvailable(jupyterUrl)) {
+      return Response
+        .status(500)
+        .entity(
+          """
+      {
+        "success": false,
+        "message": "Cannot connect to Jupyter server"
+      }
+      """
+        )
+        .build()
+    }
+
+    try {
+      val json = mapper.readTree(body)
+
+      val notebookName = json.get("notebookName").asText()
+      val notebookData = json.get("notebookData")
+
+      // Construct Jupyter API URL
+      val apiUrl = s"$jupyterUrl/api/contents/work/$notebookName"
+
+      val url = new URL(apiUrl)
+      val conn = url.openConnection().asInstanceOf[HttpURLConnection]
+
+      conn.setRequestMethod("PUT")
+      conn.setDoOutput(true)
+      conn.setRequestProperty("Content-Type", "application/json")
+
+      val requestBody =
+        s"""
+      {
+        "type": "notebook",
+        "content": $notebookData
+      }
+      """
+
+      val os = conn.getOutputStream
+      os.write(requestBody.getBytes(StandardCharsets.UTF_8))
+      os.flush()
+      os.close()
+
+      val status = conn.getResponseCode
+
+      if (status != 200 && status != 201) {
+        return Response
+          .status(500)
+          .entity(
+            s"""
+        {
+          "success": false,
+          "message": "Failed to upload notebook to Jupyter (status $status)"
+        }
+        """
+          )
+          .build()
+      }
+
+      jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb"
+
+      Response
+        .ok(
+          s"""
+      {
+        "success": true,
+        "message": "Notebook successfully sent to Jupyter."
+      }
+      """
+        )
+        .build()
+
+    } catch {
+      case NonFatal(e) =>
+        logger.error("Error sending notebook to Jupyter", e)
+        Response
+          .status(Response.Status.INTERNAL_SERVER_ERROR)
+          .entity(s"""{"error":"${e.getMessage}"}""")
+          .build()
+    }
+  }
+
+  // Store notebook + mapping in database
+  def storeNotebookAndMapping(body: String): Response = {
+    try {
+      val json = mapper.readTree(body)
+
+      val wid: java.lang.Integer = json.get("wid").asInt()
+      val vid: java.lang.Integer = json.get("vid").asInt()
+      val mappingNode = json.get("mapping")
+      val notebookNode = json.get("notebook")
+
+      val dsl = SqlServer.getInstance().createDSLContext()
+
+      val nid: java.lang.Integer = SqlServer.withTransaction(dsl) { ctx =>
+        // Insert notebook
+        val notebookRecord = ctx
+          .insertInto(Notebook.NOTEBOOK)
+          .set(Notebook.NOTEBOOK.WID, wid)
+          .set(Notebook.NOTEBOOK.NOTEBOOK_, 
JSONB.valueOf(notebookNode.toString))
+          .returning(Notebook.NOTEBOOK.NID)
+          .fetchOne()
+
+        val nidInside: java.lang.Integer = 
notebookRecord.getValue(Notebook.NOTEBOOK.NID)
+
+        // Insert workflow-notebook mapping
+        ctx
+          .insertInto(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING)
+          .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.WID, wid)
+          .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.VID, vid)
+          .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.NID, 
nidInside)
+          .set(
+            WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING,
+            JSONB.valueOf(mappingNode.toString)
+          )
+          .execute()
+
+        nidInside
+      }
+
+      Response
+        .ok(
+          s"""
+      {
+        "success": true,
+        "message": "Notebook and mapping successfully stored. wid: $wid, vid: 
$vid, nid: $nid"
+      }
+      """
+        )
+        .build()
+
+    } catch {
+      case NonFatal(e) =>
+        logger.error("Error storing mapping and workflow", e)
+        Response
+          .status(Response.Status.INTERNAL_SERVER_ERROR)
+          .entity(s"""{"error":"${e.getMessage}"}""")
+          .build()
+    }
+  }
+
+  // Fetch notebook + mapping
+  def fetchNotebookAndMapping(body: String): Response = {
+    try {
+      val json = mapper.readTree(body)
+
+      val wid: java.lang.Integer = json.get("wid").asInt()
+      val vid: java.lang.Integer = json.get("vid").asInt()
+
+      val dsl = SqlServer.getInstance().createDSLContext()
+
+      // Fetch the most recent notebook (highest nid) for this workflow version
+      val result = dsl
+        .select(
+          Notebook.NOTEBOOK.NID,
+          Notebook.NOTEBOOK.NOTEBOOK_,
+          WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING
+        )
+        .from(Notebook.NOTEBOOK)
+        .join(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING)
+        
.on(Notebook.NOTEBOOK.WID.eq(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.WID))
+        
.and(Notebook.NOTEBOOK.NID.eq(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.NID))
+        .where(Notebook.NOTEBOOK.WID.eq(wid))
+        .and(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.VID.eq(vid))
+        .orderBy(Notebook.NOTEBOOK.NID.desc()) // most recent nid first
+        .limit(1) // only take the latest
+        .fetchOne()
+
+      if (result == null) {
+        Response.ok("""{"exists": false}""").build()
+      } else {
+        val nid: Int = result.getValue(Notebook.NOTEBOOK.NID)
+        val notebookJson: String =
+          result.get(Notebook.NOTEBOOK.NOTEBOOK_).asInstanceOf[JSONB].data()
+        val mappingJson: String = result
+          .get(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING)
+          .asInstanceOf[JSONB]
+          .data()
+
+        Response
+          .ok(
+            s"""
+        {
+          "exists": true,
+          "notebook": $notebookJson,
+          "mapping": $mappingJson
+        }
+        """
+          )
+          .build()
+      }
+
+    } catch {
+      case NonFatal(e) =>
+        logger.error("Database error retrieving mapping", e)
+        Response
+          .status(Response.Status.INTERNAL_SERVER_ERROR)
+          .entity(s"""{"error":"${e.getMessage}"}""")
+          .build()
+    }
+  }
+}
+
+@Path("/notebook-migration")
+@Produces(Array(MediaType.APPLICATION_JSON))
+@Consumes(Array(MediaType.APPLICATION_JSON))
+class NotebookMigrationResource extends LazyLogging {
+
+  @GET
+  @Path("/get-jupyter-iframe-url")
+  def getJupyterIframeURL: Response = {
+    logger.info("Getting Jupyter iframe URL")
+    NotebookMigrationResource.getJupyterIframeURL()
+  }
+
+  @GET
+  @Path("/get-jupyter-url")
+  def getJupyterURL: Response = {
+    logger.info("Getting Jupyter API URL")
+    NotebookMigrationResource.getJupyterURL()
+  }
+
+  @POST
+  @Path("/set-notebook")
+  def setNotebook(body: String): Response = {
+    logger.info("Setting notebook, request body: " + body)
+    NotebookMigrationResource.setNotebook(body)

Review Comment:
   These endpoints log the full raw request body at INFO level. For 
notebooks/mappings this can be very large and may contain sensitive user data; 
please log only minimal metadata (e.g., wid/vid/nid, sizes) and avoid emitting 
the full JSON payload.



##########
notebook-migration-service/src/main/resources/docker-compose.yml:
##########
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: texera-jupyter
+services:
+
+  jupyter:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: texera-jupyter
+    ports:
+      - "9100:8888"
+    command: >
+      start-notebook.sh
+      --NotebookApp.token=''
+      --NotebookApp.password=''
+      --NotebookApp.disable_check_xsrf=True
+      --NotebookApp.tornado_settings="{'headers': {'Content-Security-Policy': 
'frame-ancestors http://localhost:*'}}"
+      --NotebookApp.default_url=/tree

Review Comment:
   The Jupyter container is started with token/password disabled and XSRF 
checks disabled. This makes the notebook server unauthenticated and vulnerable 
whenever it is reachable beyond localhost, and the `"9100:8888"` port mapping 
publishes the port on all host interfaces by default; please keep secure 
defaults and make the insecure settings opt-in via env vars or a separate 
dev-only compose file.
   ```suggestion
         /bin/sh -c "start-notebook.sh
         --NotebookApp.tornado_settings=\"{'headers': 
{'Content-Security-Policy': 'frame-ancestors http://localhost:*'}}\"
         --NotebookApp.default_url=/tree
         $${JUPYTER_DEV_FLAGS:-}"
   ```



##########
frontend/src/app/workspace/component/jupyter-notebook-panel/jupyter-notebook-panel.component.ts:
##########
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { Component, ElementRef, OnDestroy, OnInit, ViewChild, AfterViewInit } 
from "@angular/core";
+import { JupyterPanelService } from 
"../../service/jupyter-panel/jupyter-panel.service";
+import { from, of, Subject } from "rxjs";
+import { switchMap, takeUntil } from "rxjs/operators";
+import { DomSanitizer, SafeResourceUrl } from "@angular/platform-browser";
+import { NotebookMigrationService } from 
"../../service/notebook-migration/notebook-migration.service";
+
+@Component({
+  selector: "texera-jupyter-notebook-panel",
+  templateUrl: "./jupyter-notebook-panel.component.html",
+  styleUrls: ["./jupyter-notebook-panel.component.scss"],
+})
+export class JupyterNotebookPanelComponent implements OnInit, AfterViewInit, 
OnDestroy {
+  @ViewChild("iframeRef", { static: false }) iframeRef!: 
ElementRef<HTMLIFrameElement>; // Use static: false
+
+  isVisible: boolean = false; // Initialize to false, meaning the panel is 
hidden by default
+  jupyterUrl: SafeResourceUrl = ""; // Store the notebook URL dynamically
+  private destroy$ = new Subject<void>();
+
+  constructor(
+    private jupyterPanelService: JupyterPanelService,
+    private sanitizer: DomSanitizer,
+    private notebookMigrationService: NotebookMigrationService
+  ) {}
+
+  ngOnInit(): void {
+    this.jupyterPanelService.jupyterNotebookPanelVisible$
+      .pipe(
+        switchMap((visible: boolean) => {
+          this.isVisible = visible;
+
+          if (!visible) {
+            return of(null);
+          }
+
+          return from(this.notebookMigrationService.getJupyterIframeURL());
+        }),
+        takeUntil(this.destroy$)
+      )
+      .subscribe(url => {
+        if (url) {
+          this.jupyterUrl = this.sanitizer.bypassSecurityTrustResourceUrl(url);
+          this.checkIframeRef();
+        }
+      });
+  }
+
+  ngAfterViewInit(): void {
+    // Ensure iframe is handled after it's available in the DOM
+    this.checkIframeRef();
+  }
+
+  checkIframeRef(): void {
+    setTimeout(() => {
+      if (this.isVisible && this.iframeRef?.nativeElement) {
+        this.jupyterPanelService.setIframeRef(this.iframeRef.nativeElement);
+      } else {
+        console.error("Jupyter Iframe reference not found.");
+      }

Review Comment:
   `checkIframeRef` calls `console.error` whenever the panel is not visible or 
the iframe is not rendered yet, but it is invoked from `ngAfterViewInit` even 
while the panel is hidden (`isVisible` defaults to `false`). This will produce 
noisy errors on normal startup; only log when `isVisible` is `true` and the 
iframe ref is still missing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: add the Python Notebook Migration Tool [texera]

Reply via email to